[PATCHv3 10/15] mm/hugetlb: Remove fake head pages

Posted by Kiryl Shutsemau 3 weeks, 2 days ago
HugeTLB Vmemmap Optimization (HVO) reduces memory usage by freeing most
vmemmap pages for huge pages and remapping the freed range to a single
page containing the struct page metadata.

With the new mask-based compound_info encoding (for power-of-2 struct
page sizes), all tail pages of the same order are now identical
regardless of which compound page they belong to. This means the tail
pages can be truly shared without fake heads.

Allocate a single page of initialized tail struct pages per NUMA node
per order in the vmemmap_tails[] array in pglist_data. All huge pages
of that order on the node share this tail page, mapped read-only into
their vmemmap. The head page remains unique per huge page.

This eliminates fake heads while maintaining the same memory savings,
and simplifies compound_head() by removing fake head detection.
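
For illustration, a sketch of the property this relies on (not part of the
patch; it only mirrors the prep_compound_tail() calls used in the diff
below): two tail struct pages of the same order, initialized without a
head reference, end up bit-for-bit identical, so one read-only page of
pre-initialized tails can back every folio of that order on a node.

static void assert_tails_identical(unsigned int order)
{
	struct page a = {}, b = {};

	/* Same calls as vmemmap_get_tail() below: no head, only the order. */
	prep_compound_tail(&a, NULL, order);
	prep_compound_tail(&b, NULL, order);

	/* Holds for any two tails of the same order. */
	BUG_ON(memcmp(&a, &b, sizeof(a)));
}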

Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
---
 include/linux/mmzone.h | 16 ++++++++++++++-
 mm/hugetlb_vmemmap.c   | 44 ++++++++++++++++++++++++++++++++++++++++--
 mm/sparse-vmemmap.c    | 44 ++++++++++++++++++++++++++++++++++--------
 3 files changed, 93 insertions(+), 11 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 322ed4c42cfc..2ee3eb610291 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -82,7 +82,11 @@
  * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
  * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
  */
-#define MAX_FOLIO_ORDER		get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
+#ifdef CONFIG_64BIT
+#define MAX_FOLIO_ORDER		(34 - PAGE_SHIFT)
+#else
+#define MAX_FOLIO_ORDER		(30 - PAGE_SHIFT)
+#endif
 #else
 /*
  * Without hugetlb, gigantic folios that are bigger than a single PUD are
@@ -1408,6 +1412,13 @@ struct memory_failure_stats {
 };
 #endif
 
+/*
+ * vmemmap optimization (like HVO) is only possible for page orders that fill
+ * two or more pages with struct pages.
+ */
+#define VMEMMAP_TAIL_MIN_ORDER (ilog2(2 * PAGE_SIZE / sizeof(struct page)))
+#define NR_VMEMMAP_TAILS (MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1)
+
 /*
  * On NUMA machines, each NUMA node would have a pg_data_t to describe
  * it's memory layout. On UMA machines there is a single pglist_data which
@@ -1556,6 +1567,9 @@ typedef struct pglist_data {
 #ifdef CONFIG_MEMORY_FAILURE
 	struct memory_failure_stats mf_stats;
 #endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	unsigned long vmemmap_tails[NR_VMEMMAP_TAILS];
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 2b19c2205091..cbdca4684db1 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -18,6 +18,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "hugetlb_vmemmap.h"
+#include "internal.h"
 
 /**
  * struct vmemmap_remap_walk - walk vmemmap page table
@@ -517,6 +518,41 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *
 	return true;
 }
 
+static struct page *vmemmap_get_tail(unsigned int order, int node)
+{
+	unsigned long pfn;
+	unsigned int idx;
+	struct page *tail, *p;
+
+	idx = order - VMEMMAP_TAIL_MIN_ORDER;
+	pfn =  NODE_DATA(node)->vmemmap_tails[idx];
+	if (pfn)
+		return pfn_to_page(pfn);
+
+	tail = alloc_pages_node(node, GFP_KERNEL, 0);
+	if (!tail)
+		return NULL;
+
+	p = page_to_virt(tail);
+	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
+		prep_compound_tail(p + i, NULL, order);
+
+	spin_lock(&hugetlb_lock);
+	if (!NODE_DATA(node)->vmemmap_tails[idx]) {
+		pfn = PHYS_PFN(virt_to_phys(p));
+		NODE_DATA(node)->vmemmap_tails[idx] = pfn;
+		tail = NULL;
+	} else {
+		pfn = NODE_DATA(node)->vmemmap_tails[idx];
+	}
+	spin_unlock(&hugetlb_lock);
+
+	if (tail)
+		__free_page(tail);
+
+	return pfn_to_page(pfn);
+}
+
 static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 					    struct folio *folio,
 					    struct list_head *vmemmap_pages,
@@ -532,6 +568,12 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	if (!vmemmap_should_optimize_folio(h, folio))
 		return ret;
 
+	nid = folio_nid(folio);
+
+	vmemmap_tail = vmemmap_get_tail(h->order, nid);
+	if (!vmemmap_tail)
+		return -ENOMEM;
+
 	static_branch_inc(&hugetlb_optimize_vmemmap_key);
 
 	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
@@ -549,7 +591,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	 */
 	folio_set_hugetlb_vmemmap_optimized(folio);
 
-	nid = folio_nid(folio);
 	vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);
 
 	if (!vmemmap_head) {
@@ -561,7 +602,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	list_add(&vmemmap_head->lru, vmemmap_pages);
 	memmap_pages_add(1);
 
-	vmemmap_tail	= vmemmap_head;
 	vmemmap_start	= (unsigned long)folio;
 	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
 
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index dbd8daccade2..94b4e90fa00f 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -378,16 +378,45 @@ void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
 	}
 }
 
-/*
- * Populate vmemmap pages HVO-style. The first page contains the head
- * page and needed tail pages, the other ones are mirrors of the first
- * page.
- */
+static __meminit unsigned long vmemmap_get_tail(unsigned int order, int node)
+{
+	unsigned long pfn;
+	unsigned int idx;
+	struct page *p;
+
+	BUG_ON(order < VMEMMAP_TAIL_MIN_ORDER);
+	BUG_ON(order > MAX_FOLIO_ORDER);
+
+	idx = order - VMEMMAP_TAIL_MIN_ORDER;
+	pfn =  NODE_DATA(node)->vmemmap_tails[idx];
+	if (pfn)
+		return pfn;
+
+	p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
+	if (!p)
+		return 0;
+
+	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
+		prep_compound_tail(p + i, NULL, order);
+
+	pfn = PHYS_PFN(virt_to_phys(p));
+	NODE_DATA(node)->vmemmap_tails[idx] = pfn;
+
+	return pfn;
+}
+
 int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
 				       int node, unsigned long headsize)
 {
+	unsigned long maddr, len, tail_pfn;
+	unsigned int order;
 	pte_t *pte;
-	unsigned long maddr;
+
+	len = end - addr;
+	order = ilog2(len * sizeof(struct page) / PAGE_SIZE);
+	tail_pfn = vmemmap_get_tail(order, node);
+	if (!tail_pfn)
+		return -ENOMEM;
 
 	for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
 		pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
@@ -398,8 +427,7 @@ int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
 	/*
 	 * Reuse the last page struct page mapped above for the rest.
 	 */
-	return vmemmap_populate_range(maddr, end, node, NULL,
-					pte_pfn(ptep_get(pte)), 0);
+	return vmemmap_populate_range(maddr, end, node, NULL, tail_pfn, 0);
 }
 
 void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
-- 
2.51.2
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by David Hildenbrand (Red Hat) 3 weeks, 2 days ago
On 1/15/26 15:45, Kiryl Shutsemau wrote:
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 322ed4c42cfc..2ee3eb610291 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -82,7 +82,11 @@
>    * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
>    * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
>    */
> -#define MAX_FOLIO_ORDER		get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
> +#ifdef CONFIG_64BIT
> +#define MAX_FOLIO_ORDER		(34 - PAGE_SHIFT)
> +#else
> +#define MAX_FOLIO_ORDER		(30 - PAGE_SHIFT)
> +#endif

Where do these magic values stem from, and how do they relate to the
comment above that clearly spells out 16G vs. 1G?


-- 
Cheers

David
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by Kiryl Shutsemau 3 weeks, 2 days ago
On Thu, Jan 15, 2026 at 05:49:43PM +0100, David Hildenbrand (Red Hat) wrote:
> On 1/15/26 15:45, Kiryl Shutsemau wrote:
> > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > index 322ed4c42cfc..2ee3eb610291 100644
> > --- a/include/linux/mmzone.h
> > +++ b/include/linux/mmzone.h
> > @@ -82,7 +82,11 @@
> >    * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
> >    * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
> >    */
> > -#define MAX_FOLIO_ORDER		get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
> > +#ifdef CONFIG_64BIT
> > +#define MAX_FOLIO_ORDER		(34 - PAGE_SHIFT)
> > +#else
> > +#define MAX_FOLIO_ORDER		(30 - PAGE_SHIFT)
> > +#endif
> 
> Where do these magic values stem from, and how do they related to the
> comment above that clearly spells out 16G vs. 1G ?

This doesn't change the resulting value: 1UL << 34 is 16 GiB, 1UL << 30
is 1 GiB. Subtract PAGE_SHIFT to get the order.

The change allows the value to be used to define NR_VMEMMAP_TAILS, which
is used to specify the size of the vmemmap_tails[] array.
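
For illustration, with 4 KiB pages and a 64-byte struct page (values
assumed here, not spelled out in the thread):

  MAX_FOLIO_ORDER        = 34 - 12 = 22            /* (1UL << 22) * 4 KiB = 16 GiB */
  VMEMMAP_TAIL_MIN_ORDER = ilog2(2 * 4096 / 64) = 7
  NR_VMEMMAP_TAILS       = 22 - 7 + 1 = 16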

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by Muchun Song 3 weeks, 1 day ago

> On Jan 16, 2026, at 01:23, Kiryl Shutsemau <kas@kernel.org> wrote:
> 
> On Thu, Jan 15, 2026 at 05:49:43PM +0100, David Hildenbrand (Red Hat) wrote:
>> On 1/15/26 15:45, Kiryl Shutsemau wrote:
>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>> index 322ed4c42cfc..2ee3eb610291 100644
>>> --- a/include/linux/mmzone.h
>>> +++ b/include/linux/mmzone.h
>>> @@ -82,7 +82,11 @@
>>>   * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
>>>   * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
>>>   */
>>> -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
>>> +#ifdef CONFIG_64BIT
>>> +#define MAX_FOLIO_ORDER (34 - PAGE_SHIFT)
>>> +#else
>>> +#define MAX_FOLIO_ORDER (30 - PAGE_SHIFT)
>>> +#endif
>> 
>> Where do these magic values stem from, and how do they related to the
>> comment above that clearly spells out 16G vs. 1G ?
> 
> This doesn't change the resulting value: 1UL << 34 is 16GiB, 1UL << 30
> is 1G. Subtract PAGE_SHIFT to get the order.
> 
> The change allows the value to be used to define NR_VMEMMAP_TAILS which
> is used specify size of vmemmap_tails array.

How about allocating the ->vmemmap_tails array dynamically? If sizeof(struct page)
is not a power of two, then we could optimize away this array. Besides,
the original MAX_FOLIO_ORDER could work as well.

> 
> -- 
>  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by Kiryl Shutsemau 3 weeks, 1 day ago
On Fri, Jan 16, 2026 at 10:38:02AM +0800, Muchun Song wrote:
> 
> 
> > On Jan 16, 2026, at 01:23, Kiryl Shutsemau <kas@kernel.org> wrote:
> > 
> > On Thu, Jan 15, 2026 at 05:49:43PM +0100, David Hildenbrand (Red Hat) wrote:
> >> On 1/15/26 15:45, Kiryl Shutsemau wrote:
> >>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >>> index 322ed4c42cfc..2ee3eb610291 100644
> >>> --- a/include/linux/mmzone.h
> >>> +++ b/include/linux/mmzone.h
> >>> @@ -82,7 +82,11 @@
> >>>   * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
> >>>   * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
> >>>   */
> >>> -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
> >>> +#ifdef CONFIG_64BIT
> >>> +#define MAX_FOLIO_ORDER (34 - PAGE_SHIFT)
> >>> +#else
> >>> +#define MAX_FOLIO_ORDER (30 - PAGE_SHIFT)
> >>> +#endif
> >> 
> >> Where do these magic values stem from, and how do they related to the
> >> comment above that clearly spells out 16G vs. 1G ?
> > 
> > This doesn't change the resulting value: 1UL << 34 is 16GiB, 1UL << 30
> > is 1G. Subtract PAGE_SHIFT to get the order.
> > 
> > The change allows the value to be used to define NR_VMEMMAP_TAILS which
> > is used specify size of vmemmap_tails array.
> 
> How about allocate ->vmemmap_tails array dynamically? If sizeof of struct
> page is not power of two, then we could optimize away this array. Besides,
> the original MAX_FOLIO_ORDER could work as well.

This is tricky.

We need the vmemmap_tails array to be around early, in
hugetlb_vmemmap_init_early(). By that time, slab is not functional yet.

I think sizing the array at compile time is the best shot.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by Muchun Song 3 weeks ago

> On Jan 16, 2026, at 23:52, Kiryl Shutsemau <kas@kernel.org> wrote:
> 
> On Fri, Jan 16, 2026 at 10:38:02AM +0800, Muchun Song wrote:
>> 
>> 
>>> On Jan 16, 2026, at 01:23, Kiryl Shutsemau <kas@kernel.org> wrote:
>>> 
>>> On Thu, Jan 15, 2026 at 05:49:43PM +0100, David Hildenbrand (Red Hat) wrote:
>>>> On 1/15/26 15:45, Kiryl Shutsemau wrote:
>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>> index 322ed4c42cfc..2ee3eb610291 100644
>>>>> --- a/include/linux/mmzone.h
>>>>> +++ b/include/linux/mmzone.h
>>>>> @@ -82,7 +82,11 @@
>>>>>  * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
>>>>>  * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
>>>>>  */
>>>>> -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
>>>>> +#ifdef CONFIG_64BIT
>>>>> +#define MAX_FOLIO_ORDER (34 - PAGE_SHIFT)
>>>>> +#else
>>>>> +#define MAX_FOLIO_ORDER (30 - PAGE_SHIFT)
>>>>> +#endif
>>>> 
>>>> Where do these magic values stem from, and how do they related to the
>>>> comment above that clearly spells out 16G vs. 1G ?
>>> 
>>> This doesn't change the resulting value: 1UL << 34 is 16GiB, 1UL << 30
>>> is 1G. Subtract PAGE_SHIFT to get the order.
>>> 
>>> The change allows the value to be used to define NR_VMEMMAP_TAILS which
>>> is used specify size of vmemmap_tails array.
>> 
>> How about allocate ->vmemmap_tails array dynamically? If sizeof of struct
>> page is not power of two, then we could optimize away this array. Besides,
>> the original MAX_FOLIO_ORDER could work as well.
> 
> This is tricky.
> 
> We need vmemmap_tails array to be around early, in
> hugetlb_vmemmap_init_early(). By the time, we don't have slab
> functional yet.

I mean zero-size array at the end of pg_data_t, no slab is needed.
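
For illustration, a rough sketch of that idea (hypothetical, not a posted
patch; the memblock call and the names used here are assumptions):

  typedef struct pglist_data {
	...
	/* trailing flexible array, sized per node when node_data is set up */
	unsigned long vmemmap_tails[];
  } pg_data_t;

  /* at node init time, roughly: */
  nd = memblock_alloc_node(sizeof(*nd) +
			   nr_vmemmap_tails * sizeof(unsigned long),
			   SMP_CACHE_BYTES, nid);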

> 
> I think getting the array compile-time is the best shot.
> 
> -- 
>  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by Kiryl Shutsemau 2 weeks, 5 days ago
On Sat, Jan 17, 2026 at 10:38:48AM +0800, Muchun Song wrote:
> 
> 
> > On Jan 16, 2026, at 23:52, Kiryl Shutsemau <kas@kernel.org> wrote:
> > 
> > On Fri, Jan 16, 2026 at 10:38:02AM +0800, Muchun Song wrote:
> >> 
> >> 
> >>> On Jan 16, 2026, at 01:23, Kiryl Shutsemau <kas@kernel.org> wrote:
> >>> 
> >>> On Thu, Jan 15, 2026 at 05:49:43PM +0100, David Hildenbrand (Red Hat) wrote:
> >>>> On 1/15/26 15:45, Kiryl Shutsemau wrote:
> >>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> >>>>> index 322ed4c42cfc..2ee3eb610291 100644
> >>>>> --- a/include/linux/mmzone.h
> >>>>> +++ b/include/linux/mmzone.h
> >>>>> @@ -82,7 +82,11 @@
> >>>>>  * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
> >>>>>  * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
> >>>>>  */
> >>>>> -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
> >>>>> +#ifdef CONFIG_64BIT
> >>>>> +#define MAX_FOLIO_ORDER (34 - PAGE_SHIFT)
> >>>>> +#else
> >>>>> +#define MAX_FOLIO_ORDER (30 - PAGE_SHIFT)
> >>>>> +#endif
> >>>> 
> >>>> Where do these magic values stem from, and how do they related to the
> >>>> comment above that clearly spells out 16G vs. 1G ?
> >>> 
> >>> This doesn't change the resulting value: 1UL << 34 is 16GiB, 1UL << 30
> >>> is 1G. Subtract PAGE_SHIFT to get the order.
> >>> 
> >>> The change allows the value to be used to define NR_VMEMMAP_TAILS which
> >>> is used specify size of vmemmap_tails array.
> >> 
> >> How about allocate ->vmemmap_tails array dynamically? If sizeof of struct
> >> page is not power of two, then we could optimize away this array. Besides,
> >> the original MAX_FOLIO_ORDER could work as well.
> > 
> > This is tricky.
> > 
> > We need vmemmap_tails array to be around early, in
> > hugetlb_vmemmap_init_early(). By the time, we don't have slab
> > functional yet.
> 
> I mean zero-size array at the end of pg_data_t, no slab is needed.

For !NUMA, the struct is in BSS. See contig_page_data.

Dynamic array won't fly there.
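
For illustration, roughly what blocks it in the !NUMA case (simplified
from the actual definitions around NODE_DATA for !NUMA):

  /* !NUMA: a single statically allocated node */
  struct pglist_data contig_page_data;
  #define NODE_DATA(nid)	(&contig_page_data)

  /*
   * A trailing flexible vmemmap_tails[] would get no storage in this
   * static object, so the array length has to be a compile-time constant.
   */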

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by Muchun Song 2 weeks, 4 days ago

> On Jan 19, 2026, at 23:15, Kiryl Shutsemau <kas@kernel.org> wrote:
> 
> On Sat, Jan 17, 2026 at 10:38:48AM +0800, Muchun Song wrote:
>> 
>> 
>>> On Jan 16, 2026, at 23:52, Kiryl Shutsemau <kas@kernel.org> wrote:
>>> 
>>> On Fri, Jan 16, 2026 at 10:38:02AM +0800, Muchun Song wrote:
>>>> 
>>>> 
>>>>> On Jan 16, 2026, at 01:23, Kiryl Shutsemau <kas@kernel.org> wrote:
>>>>> 
>>>>> On Thu, Jan 15, 2026 at 05:49:43PM +0100, David Hildenbrand (Red Hat) wrote:
>>>>>> On 1/15/26 15:45, Kiryl Shutsemau wrote:
>>>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>>>> index 322ed4c42cfc..2ee3eb610291 100644
>>>>>>> --- a/include/linux/mmzone.h
>>>>>>> +++ b/include/linux/mmzone.h
>>>>>>> @@ -82,7 +82,11 @@
>>>>>>> * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
>>>>>>> * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
>>>>>>> */
>>>>>>> -#define MAX_FOLIO_ORDER get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
>>>>>>> +#ifdef CONFIG_64BIT
>>>>>>> +#define MAX_FOLIO_ORDER (34 - PAGE_SHIFT)
>>>>>>> +#else
>>>>>>> +#define MAX_FOLIO_ORDER (30 - PAGE_SHIFT)
>>>>>>> +#endif
>>>>>> 
>>>>>> Where do these magic values stem from, and how do they related to the
>>>>>> comment above that clearly spells out 16G vs. 1G ?
>>>>> 
>>>>> This doesn't change the resulting value: 1UL << 34 is 16GiB, 1UL << 30
>>>>> is 1G. Subtract PAGE_SHIFT to get the order.
>>>>> 
>>>>> The change allows the value to be used to define NR_VMEMMAP_TAILS which
>>>>> is used specify size of vmemmap_tails array.
>>>> 
>>>> How about allocate ->vmemmap_tails array dynamically? If sizeof of struct
>>>> page is not power of two, then we could optimize away this array. Besides,
>>>> the original MAX_FOLIO_ORDER could work as well.
>>> 
>>> This is tricky.
>>> 
>>> We need vmemmap_tails array to be around early, in
>>> hugetlb_vmemmap_init_early(). By the time, we don't have slab
>>> functional yet.
>> 
>> I mean zero-size array at the end of pg_data_t, no slab is needed.
> 
> For !NUMA, the struct is in BSS. See contig_page_data.

Right. I missed that.

> 
> Dynamic array won't fly there.
> 
> -- 
>  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by David Hildenbrand (Red Hat) 3 weeks, 2 days ago
On 1/15/26 18:23, Kiryl Shutsemau wrote:
> On Thu, Jan 15, 2026 at 05:49:43PM +0100, David Hildenbrand (Red Hat) wrote:
>> On 1/15/26 15:45, Kiryl Shutsemau wrote:
>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>> index 322ed4c42cfc..2ee3eb610291 100644
>>> --- a/include/linux/mmzone.h
>>> +++ b/include/linux/mmzone.h
>>> @@ -82,7 +82,11 @@
>>>     * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
>>>     * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
>>>     */
>>> -#define MAX_FOLIO_ORDER		get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
>>> +#ifdef CONFIG_64BIT
>>> +#define MAX_FOLIO_ORDER		(34 - PAGE_SHIFT)
>>> +#else
>>> +#define MAX_FOLIO_ORDER		(30 - PAGE_SHIFT)
>>> +#endif
>>
>> Where do these magic values stem from, and how do they related to the
>> comment above that clearly spells out 16G vs. 1G ?
> 
> This doesn't change the resulting value: 1UL << 34 is 16GiB, 1UL << 30
> is 1G. Subtract PAGE_SHIFT to get the order.
> 
> The change allows the value to be used to define NR_VMEMMAP_TAILS which
> is used specify size of vmemmap_tails array.

get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) should evaluate to 
a constant by the compiler.

See __builtin_constant_p handling in get_order().

If that is not working then we have to figure out why.

Was this only in a specific config where you ran into compile-time problems?

-- 
Cheers

David
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by Kiryl Shutsemau 3 weeks, 1 day ago
On Thu, Jan 15, 2026 at 06:41:44PM +0100, David Hildenbrand (Red Hat) wrote:
> On 1/15/26 18:23, Kiryl Shutsemau wrote:
> > On Thu, Jan 15, 2026 at 05:49:43PM +0100, David Hildenbrand (Red Hat) wrote:
> > > On 1/15/26 15:45, Kiryl Shutsemau wrote:
> > > > diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> > > > index 322ed4c42cfc..2ee3eb610291 100644
> > > > --- a/include/linux/mmzone.h
> > > > +++ b/include/linux/mmzone.h
> > > > @@ -82,7 +82,11 @@
> > > >     * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
> > > >     * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
> > > >     */
> > > > -#define MAX_FOLIO_ORDER		get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
> > > > +#ifdef CONFIG_64BIT
> > > > +#define MAX_FOLIO_ORDER		(34 - PAGE_SHIFT)
> > > > +#else
> > > > +#define MAX_FOLIO_ORDER		(30 - PAGE_SHIFT)
> > > > +#endif
> > > 
> > > Where do these magic values stem from, and how do they related to the
> > > comment above that clearly spells out 16G vs. 1G ?
> > 
> > This doesn't change the resulting value: 1UL << 34 is 16GiB, 1UL << 30
> > is 1G. Subtract PAGE_SHIFT to get the order.
> > 
> > The change allows the value to be used to define NR_VMEMMAP_TAILS which
> > is used specify size of vmemmap_tails array.
> 
> get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) should evaluate to a
> constant by the compiler.
>
> See __builtin_constant_p handling in get_order().
> 
> If that is not working then we have to figure out why.

asm-offsets.s compilation fails:

../include/linux/mmzone.h:1574:16: error: fields must have a constant size:
      'variable length array in structure' extension will never be supported
 1574 |         unsigned long vmemmap_tails[NR_VMEMMAP_TAILS];

Here's what the preprocessor dump of vmemmap_tails looks like:

 unsigned long vmemmap_tails[(get_order(1 ? (0x400000000ULL) : 0x40000000) - (( __builtin_constant_p(2 * ((1UL) << 12) / sizeof(struct page)) ? ((2 * ((1UL) << 12) / sizeof(struct page)) < 2 ? 0 : 63 - __builtin_clzll(2 * ((1UL) << 12) / sizeof(struct page))) : (sizeof(2 * ((1UL) << 12) / sizeof(struct page)) <= 4) ? __ilog2_u32(2 * ((1UL) << 12) / sizeof(struct page)) : __ilog2_u64(2 * ((1UL) << 12) / sizeof(struct page)) )) + 1)];

And here's get_order():

static inline __attribute__((__gnu_inline__)) __attribute__((__unused__)) __attribute__((no_instrument_function)) __attribute__((__always_inline__)) __attribute__((__const__)) int get_order(unsigned long size)
{
 if (__builtin_constant_p(size)) {
  if (!size)
   return 64 - 12;

  if (size < (1UL << 12))
   return 0;

  return ( __builtin_constant_p((size) - 1) ? (((size) - 1) < 2 ? 0 : 63 - __builtin_clzll((size) - 1)) : (sizeof((size) - 1) <= 4) ? __ilog2_u32((size) - 1) : __ilog2_u64((size) - 1) ) - 12 + 1;
 }

 size--;
 size >>= 12;



 return fls64(size);

}

I am not sure why it is not a compile-time constant. I have not dug
deeper.

Switching to ilog2(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) - PAGE_SHIFT works,
but I personally find my variant more readable.

Do you want me to dig deeper to check whether making get_order() work is
possible?

> Was this only a specific config in where you ran into compile-time problems?

I am not aware of any particular config dependency. It seems to happen
everywhere.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by David Hildenbrand (Red Hat) 3 weeks, 1 day ago
On 1/15/26 19:58, Kiryl Shutsemau wrote:
> On Thu, Jan 15, 2026 at 06:41:44PM +0100, David Hildenbrand (Red Hat) wrote:
>> On 1/15/26 18:23, Kiryl Shutsemau wrote:
>>> On Thu, Jan 15, 2026 at 05:49:43PM +0100, David Hildenbrand (Red Hat) wrote:
>>>> On 1/15/26 15:45, Kiryl Shutsemau wrote:
>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>> index 322ed4c42cfc..2ee3eb610291 100644
>>>>> --- a/include/linux/mmzone.h
>>>>> +++ b/include/linux/mmzone.h
>>>>> @@ -82,7 +82,11 @@
>>>>>      * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
>>>>>      * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
>>>>>      */
>>>>> -#define MAX_FOLIO_ORDER		get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
>>>>> +#ifdef CONFIG_64BIT
>>>>> +#define MAX_FOLIO_ORDER		(34 - PAGE_SHIFT)
>>>>> +#else
>>>>> +#define MAX_FOLIO_ORDER		(30 - PAGE_SHIFT)
>>>>> +#endif
>>>>
>>>> Where do these magic values stem from, and how do they related to the
>>>> comment above that clearly spells out 16G vs. 1G ?
>>>
>>> This doesn't change the resulting value: 1UL << 34 is 16GiB, 1UL << 30
>>> is 1G. Subtract PAGE_SHIFT to get the order.
>>>
>>> The change allows the value to be used to define NR_VMEMMAP_TAILS which
>>> is used specify size of vmemmap_tails array.
>>
>> get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) should evaluate to a
>> constant by the compiler.
>>
>> See __builtin_constant_p handling in get_order().
>>
>> If that is not working then we have to figure out why.
> 
> asm-offsets.s compilation fails:
> 
> ../include/linux/mmzone.h:1574:16: error: fields must have a constant size:
>        'variable length array in structure' extension will never be supported
>   1574 |         unsigned long vmemmap_tails[NR_VMEMMAP_TAILS];
> 
> 
> I am not sure why it is not compile-time constant. I have not dig
> deeper.

Very weird. Almost sounds like a bug given that get_order() ends up using ilog2.

But it gets even weirder:

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6f959d8ca4b42..a54445682ccc4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2281,6 +2281,9 @@ static inline unsigned long folio_nr_pages(const struct folio *folio)
   * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
   */
  #define MAX_FOLIO_ORDER                get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
+
+static_assert(__builtin_constant_p(MAX_FOLIO_ORDER));
+
  #else
  /*
   * Without hugetlb, gigantic folios that are bigger than a single PUD are

gives me


./include/linux/build_bug.h:78:41: error: static assertion failed: "__builtin_constant_p(MAX_FOLIO_ORDER)"
    78 | #define __static_assert(expr, msg, ...) _Static_assert(expr, msg)
       |                                         ^~~~~~~~~~~~~~
./include/linux/build_bug.h:77:34: note: in expansion of macro '__static_assert'
    77 | #define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr)
       |                                  ^~~~~~~~~~~~~~~
./include/linux/mm.h:2285:1: note: in expansion of macro 'static_assert'
  2285 | static_assert(__builtin_constant_p(MAX_FOLIO_ORDER));
       | ^~~~~~~~~~~~~

And reversing the condition fixes it.

... so it is a constant? Huh?



Some history on the SZ change here: https://lore.kernel.org/all/a31e6d70-9275-4277-991b-9de1aea03cd7@csgroup.eu/

> 
> Switching to ilog2(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) - PAGE_SHIFT works,
> but I personally find my variant more readable.
> 

Using SZ_16G/SZ_1G is self-documenting. I'm fine with repeating the ilog2 like this:


#ifdef CONFIG_64BIT
#define MAX_FOLIO_ORDER		(ilog2(SZ_16G) - PAGE_SHIFT)
...


Also, make sure to spell that out in the patch description.


Figuring out why we don't get a constant would be even nicer ... or why this behaves differently
than expected.

-- 
Cheers

David
Re: [PATCHv3 10/15] mm/hugetlb: Remove fake head pages
Posted by David Hildenbrand (Red Hat) 3 weeks, 1 day ago
On 1/15/26 20:33, David Hildenbrand (Red Hat) wrote:
> On 1/15/26 19:58, Kiryl Shutsemau wrote:
>> On Thu, Jan 15, 2026 at 06:41:44PM +0100, David Hildenbrand (Red Hat) wrote:
>>> On 1/15/26 18:23, Kiryl Shutsemau wrote:
>>>> On Thu, Jan 15, 2026 at 05:49:43PM +0100, David Hildenbrand (Red Hat) wrote:
>>>>> On 1/15/26 15:45, Kiryl Shutsemau wrote:
>>>>>> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
>>>>>> index 322ed4c42cfc..2ee3eb610291 100644
>>>>>> --- a/include/linux/mmzone.h
>>>>>> +++ b/include/linux/mmzone.h
>>>>>> @@ -82,7 +82,11 @@
>>>>>>       * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
>>>>>>       * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
>>>>>>       */
>>>>>> -#define MAX_FOLIO_ORDER		get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
>>>>>> +#ifdef CONFIG_64BIT
>>>>>> +#define MAX_FOLIO_ORDER		(34 - PAGE_SHIFT)
>>>>>> +#else
>>>>>> +#define MAX_FOLIO_ORDER		(30 - PAGE_SHIFT)
>>>>>> +#endif
>>>>>
>>>>> Where do these magic values stem from, and how do they related to the
>>>>> comment above that clearly spells out 16G vs. 1G ?
>>>>
>>>> This doesn't change the resulting value: 1UL << 34 is 16GiB, 1UL << 30
>>>> is 1G. Subtract PAGE_SHIFT to get the order.
>>>>
>>>> The change allows the value to be used to define NR_VMEMMAP_TAILS which
>>>> is used specify size of vmemmap_tails array.
>>>
>>> get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G) should evaluate to a
>>> constant by the compiler.
>>>
>>> See __builtin_constant_p handling in get_order().
>>>
>>> If that is not working then we have to figure out why.
>>
>> asm-offsets.s compilation fails:
>>
>> ../include/linux/mmzone.h:1574:16: error: fields must have a constant size:
>>         'variable length array in structure' extension will never be supported
>>    1574 |         unsigned long vmemmap_tails[NR_VMEMMAP_TAILS];
>>
>>
>> I am not sure why it is not compile-time constant. I have not dig
>> deeper.
> 
> Very weird. Almost sounds like a bug given that get_order() ends up using ilog2.
> 
> But it gets even weirder:
> 
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 6f959d8ca4b42..a54445682ccc4 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -2281,6 +2281,9 @@ static inline unsigned long folio_nr_pages(const struct folio *folio)
>     * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
>     */
>    #define MAX_FOLIO_ORDER                get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
> +
> +static_assert(__builtin_constant_p(MAX_FOLIO_ORDER));
> +
>    #else
>    /*
>     * Without hugetlb, gigantic folios that are bigger than a single PUD are
> 
> gives me
> 
> 
> ./include/linux/build_bug.h:78:41: error: static assertion failed: "__builtin_constant_p(MAX_FOLIO_ORDER)"
>      78 | #define __static_assert(expr, msg, ...) _Static_assert(expr, msg)
>         |                                         ^~~~~~~~~~~~~~
> ./include/linux/build_bug.h:77:34: note: in expansion of macro '__static_assert'
>      77 | #define static_assert(expr, ...) __static_assert(expr, ##__VA_ARGS__, #expr)
>         |                                  ^~~~~~~~~~~~~~~
> ./include/linux/mm.h:2285:1: note: in expansion of macro 'static_assert'
>    2285 | static_assert(__builtin_constant_p(MAX_FOLIO_ORDER));
>         | ^~~~~~~~~~~~~
> 
> And reversing the condition fixes it.
> 
> ... so it is a constant? Huh?

I've been staring at the computer for too long; this is not BUILD_BUG
semantics. So we don't get a constant.

For some reason :)

Even when I just use get_order(4096).
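
One plausible explanation (an assumption, not something confirmed in this
thread): the array bound of a struct member has to be an integer constant
expression, and a call to an inline function such as get_order() never
qualifies as one, even when the optimizer can fold it later, while ilog2()
is a macro that expands to builtins over constants and does qualify.
A minimal illustration of the difference:

  static inline int f(void) { return 4; }
  #define F 4

  struct ok  { int a[F];   };	/* fine: F is an integer constant expression */
  struct bad { int a[f()]; };	/* rejected as a VLA in a struct, the same
				   failure mode as get_order() above */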

-- 
Cheers

David
[PATCHv3.1 10/15] mm/hugetlb: Remove fake head pages
Posted by Kiryl Shutsemau 3 weeks, 1 day ago
HugeTLB Vmemmap Optimization (HVO) reduces memory usage by freeing most
vmemmap pages for huge pages and remapping the freed range to a single
page containing the struct page metadata.

With the new mask-based compound_info encoding (for power-of-2 struct
page sizes), all tail pages of the same order are now identical
regardless of which compound page they belong to. This means the tail
pages can be truly shared without fake heads.

Allocate a single page of initialized tail struct pages per NUMA node
per order in the vmemmap_tails[] array in pglist_data. All huge pages of
that order on the node share this tail page, mapped read-only into their
vmemmap. The head page remains unique per huge page.

Redefine MAX_FOLIO_ORDER using ilog2(). The define has to produce a
compile-time constant as it is used to specify the vmemmap_tails[] array
size. For some reason, the compiler is not able to resolve get_order() at
compile time, but ilog2() works.

This eliminates fake heads while maintaining the same memory savings,
and simplifies compound_head() by removing fake head detection.

Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
---

v3.1:
 - Define MAX_FOLIO_ORDER using ilog2();
 - Update commit message;

---
 include/linux/mmzone.h | 16 ++++++++++++++-
 mm/hugetlb_vmemmap.c   | 44 ++++++++++++++++++++++++++++++++++++++++--
 mm/sparse-vmemmap.c    | 44 ++++++++++++++++++++++++++++++++++--------
 3 files changed, 93 insertions(+), 11 deletions(-)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 322ed4c42cfc..bc333546c2d3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -82,7 +82,11 @@
  * currently expect (see CONFIG_HAVE_GIGANTIC_FOLIOS): with hugetlb, we expect
  * no folios larger than 16 GiB on 64bit and 1 GiB on 32bit.
  */
-#define MAX_FOLIO_ORDER		get_order(IS_ENABLED(CONFIG_64BIT) ? SZ_16G : SZ_1G)
+#ifdef CONFIG_64BIT
+#define MAX_FOLIO_ORDER		(ilog2(SZ_16G) - PAGE_SHIFT)
+#else
+#define MAX_FOLIO_ORDER		(ilog2(SZ_1G) - PAGE_SHIFT)
+#endif
 #else
 /*
  * Without hugetlb, gigantic folios that are bigger than a single PUD are
@@ -1408,6 +1412,13 @@ struct memory_failure_stats {
 };
 #endif
 
+/*
+ * vmemmap optimization (like HVO) is only possible for page orders that fill
+ * two or more pages with struct pages.
+ */
+#define VMEMMAP_TAIL_MIN_ORDER (ilog2(2 * PAGE_SIZE / sizeof(struct page)))
+#define NR_VMEMMAP_TAILS (MAX_FOLIO_ORDER - VMEMMAP_TAIL_MIN_ORDER + 1)
+
 /*
  * On NUMA machines, each NUMA node would have a pg_data_t to describe
  * it's memory layout. On UMA machines there is a single pglist_data which
@@ -1556,6 +1567,9 @@ typedef struct pglist_data {
 #ifdef CONFIG_MEMORY_FAILURE
 	struct memory_failure_stats mf_stats;
 #endif
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+	unsigned long vmemmap_tails[NR_VMEMMAP_TAILS];
+#endif
 } pg_data_t;
 
 #define node_present_pages(nid)	(NODE_DATA(nid)->node_present_pages)
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 2b19c2205091..cbdca4684db1 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -18,6 +18,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "hugetlb_vmemmap.h"
+#include "internal.h"
 
 /**
  * struct vmemmap_remap_walk - walk vmemmap page table
@@ -517,6 +518,41 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *
 	return true;
 }
 
+static struct page *vmemmap_get_tail(unsigned int order, int node)
+{
+	unsigned long pfn;
+	unsigned int idx;
+	struct page *tail, *p;
+
+	idx = order - VMEMMAP_TAIL_MIN_ORDER;
+	pfn =  NODE_DATA(node)->vmemmap_tails[idx];
+	if (pfn)
+		return pfn_to_page(pfn);
+
+	tail = alloc_pages_node(node, GFP_KERNEL, 0);
+	if (!tail)
+		return NULL;
+
+	p = page_to_virt(tail);
+	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
+		prep_compound_tail(p + i, NULL, order);
+
+	spin_lock(&hugetlb_lock);
+	if (!NODE_DATA(node)->vmemmap_tails[idx]) {
+		pfn = PHYS_PFN(virt_to_phys(p));
+		NODE_DATA(node)->vmemmap_tails[idx] = pfn;
+		tail = NULL;
+	} else {
+		pfn = NODE_DATA(node)->vmemmap_tails[idx];
+	}
+	spin_unlock(&hugetlb_lock);
+
+	if (tail)
+		__free_page(tail);
+
+	return pfn_to_page(pfn);
+}
+
 static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 					    struct folio *folio,
 					    struct list_head *vmemmap_pages,
@@ -532,6 +568,12 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	if (!vmemmap_should_optimize_folio(h, folio))
 		return ret;
 
+	nid = folio_nid(folio);
+
+	vmemmap_tail = vmemmap_get_tail(h->order, nid);
+	if (!vmemmap_tail)
+		return -ENOMEM;
+
 	static_branch_inc(&hugetlb_optimize_vmemmap_key);
 
 	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
@@ -549,7 +591,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	 */
 	folio_set_hugetlb_vmemmap_optimized(folio);
 
-	nid = folio_nid(folio);
 	vmemmap_head = alloc_pages_node(nid, GFP_KERNEL, 0);
 
 	if (!vmemmap_head) {
@@ -561,7 +602,6 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	list_add(&vmemmap_head->lru, vmemmap_pages);
 	memmap_pages_add(1);
 
-	vmemmap_tail	= vmemmap_head;
 	vmemmap_start	= (unsigned long)folio;
 	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
 
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index dbd8daccade2..94b4e90fa00f 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -378,16 +378,45 @@ void vmemmap_wrprotect_hvo(unsigned long addr, unsigned long end,
 	}
 }
 
-/*
- * Populate vmemmap pages HVO-style. The first page contains the head
- * page and needed tail pages, the other ones are mirrors of the first
- * page.
- */
+static __meminit unsigned long vmemmap_get_tail(unsigned int order, int node)
+{
+	unsigned long pfn;
+	unsigned int idx;
+	struct page *p;
+
+	BUG_ON(order < VMEMMAP_TAIL_MIN_ORDER);
+	BUG_ON(order > MAX_FOLIO_ORDER);
+
+	idx = order - VMEMMAP_TAIL_MIN_ORDER;
+	pfn =  NODE_DATA(node)->vmemmap_tails[idx];
+	if (pfn)
+		return pfn;
+
+	p = vmemmap_alloc_block_zero(PAGE_SIZE, node);
+	if (!p)
+		return 0;
+
+	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
+		prep_compound_tail(p + i, NULL, order);
+
+	pfn = PHYS_PFN(virt_to_phys(p));
+	NODE_DATA(node)->vmemmap_tails[idx] = pfn;
+
+	return pfn;
+}
+
 int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
 				       int node, unsigned long headsize)
 {
+	unsigned long maddr, len, tail_pfn;
+	unsigned int order;
 	pte_t *pte;
-	unsigned long maddr;
+
+	len = end - addr;
+	order = ilog2(len * sizeof(struct page) / PAGE_SIZE);
+	tail_pfn = vmemmap_get_tail(order, node);
+	if (!tail_pfn)
+		return -ENOMEM;
 
 	for (maddr = addr; maddr < addr + headsize; maddr += PAGE_SIZE) {
 		pte = vmemmap_populate_address(maddr, node, NULL, -1, 0);
@@ -398,8 +427,7 @@ int __meminit vmemmap_populate_hvo(unsigned long addr, unsigned long end,
 	/*
 	 * Reuse the last page struct page mapped above for the rest.
 	 */
-	return vmemmap_populate_range(maddr, end, node, NULL,
-					pte_pfn(ptep_get(pte)), 0);
+	return vmemmap_populate_range(maddr, end, node, NULL, tail_pfn, 0);
 }
 
 void __weak __meminit vmemmap_set_pmd(pmd_t *pmd, void *p, int node,
-- 
2.51.2