[PATCH 06/11] mm/hugetlb: Remove fake head pages

Kiryl Shutsemau posted 11 patches 1 week, 3 days ago
[PATCH 06/11] mm/hugetlb: Remove fake head pages
Posted by Kiryl Shutsemau 1 week, 3 days ago
HugeTLB optimizes vmemmap memory usage by freeing all but the first page
of vmemmap memory for the huge page and remapping the rest of the pages
to the first one.

This only occurs if the size of the struct page is a power of 2. In
these instances, the compound head position encoding in the tail pages
ensures that all tail pages of the same order are identical, regardless
of the page to which they belong.

This allows for the elimination of fake head pages without significant
memory overhead: a page full of tail struct pages is allocated per
hstate and mapped instead of the page with the head page for all pages
of the given hstate.

Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
---
 include/linux/hugetlb.h |  3 +++
 mm/hugetlb_vmemmap.c    | 31 +++++++++++++++++++++++++++----
 mm/hugetlb_vmemmap.h    |  4 ++--
 3 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 8e63e46b8e1f..75dd940fda22 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -676,6 +676,9 @@ struct hstate {
 	unsigned int free_huge_pages_node[MAX_NUMNODES];
 	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
 	char name[HSTATE_NAME_LEN];
+#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+	struct page *vmemmap_tail;
+#endif
 };
 
 struct cma;
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index f5ee499b8563..2543bdbcae20 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -18,6 +18,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlbflush.h>
 #include "hugetlb_vmemmap.h"
+#include "internal.h"
 
 /**
  * struct vmemmap_remap_walk - walk vmemmap page table
@@ -518,7 +519,24 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *
 	return true;
 }
 
-static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
+static void hugetlb_vmemmap_tail_alloc(struct hstate *h)
+{
+	struct page *p;
+
+	if (h->vmemmap_tail)
+		return;
+
+	h->vmemmap_tail = alloc_page(GFP_KERNEL | __GFP_ZERO);
+	if (!h->vmemmap_tail)
+		return;
+
+	p = page_to_virt(h->vmemmap_tail);
+
+	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
+		prep_compound_tail(p + i, p, huge_page_order(h));
+}
+
+static int __hugetlb_vmemmap_optimize_folio(struct hstate *h,
 					    struct folio *folio,
 					    struct list_head *vmemmap_pages,
 					    unsigned long flags)
@@ -533,6 +551,11 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	if (!vmemmap_should_optimize_folio(h, folio))
 		return ret;
 
+	if (!h->vmemmap_tail)
+		hugetlb_vmemmap_tail_alloc(h);
+	if (!h->vmemmap_tail)
+		return -ENOMEM;
+
 	static_branch_inc(&hugetlb_optimize_vmemmap_key);
 
 	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
@@ -562,7 +585,7 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
 	list_add(&vmemmap_head->lru, vmemmap_pages);
 	memmap_pages_add(1);
 
-	vmemmap_tail	= vmemmap_head;
+	vmemmap_tail	= h->vmemmap_tail;
 	vmemmap_start	= (unsigned long)folio;
 	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
 
@@ -594,7 +617,7 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
  * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
  * vmemmap pages have been optimized.
  */
-void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
+void hugetlb_vmemmap_optimize_folio(struct hstate *h, struct folio *folio)
 {
 	LIST_HEAD(vmemmap_pages);
 
@@ -868,7 +891,7 @@ static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
 
 static int __init hugetlb_vmemmap_init(void)
 {
-	const struct hstate *h;
+	struct hstate *h;
 
 	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
 	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
index 18b490825215..f44e40c44a21 100644
--- a/mm/hugetlb_vmemmap.h
+++ b/mm/hugetlb_vmemmap.h
@@ -24,7 +24,7 @@ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio);
 long hugetlb_vmemmap_restore_folios(const struct hstate *h,
 					struct list_head *folio_list,
 					struct list_head *non_hvo_folios);
-void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
+void hugetlb_vmemmap_optimize_folio(struct hstate *h, struct folio *folio);
 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
 void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list);
 #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
@@ -64,7 +64,7 @@ static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h,
 	return 0;
 }
 
-static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
+static inline void hugetlb_vmemmap_optimize_folio(struct hstate *h, struct folio *folio)
 {
 }
 
-- 
2.51.2
Re: [PATCH 06/11] mm/hugetlb: Remove fake head pages
Posted by Usama Arif 1 week, 2 days ago

On 05/12/2025 19:43, Kiryl Shutsemau wrote:
> HugeTLB optimizes vmemmap memory usage by freeing all but the first page
> of vmemmap memory for the huge page and remapping the rest of the pages
> to the first one.
> 
> This only occurs if the size of the struct page is a power of 2. In
> these instances, the compound head position encoding in the tail pages
> ensures that all tail pages of the same order are identical, regardless
> of the page to which they belong.
> 
> This allows for the elimination of fake head pages without significant
> memory overhead: a page full of tail struct pages is allocated per
> hstate and mapped instead of the page with the head page for all pages
> of the given hstate.
> 
> Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
> ---
>  include/linux/hugetlb.h |  3 +++
>  mm/hugetlb_vmemmap.c    | 31 +++++++++++++++++++++++++++----
>  mm/hugetlb_vmemmap.h    |  4 ++--
>  3 files changed, 32 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
> index 8e63e46b8e1f..75dd940fda22 100644
> --- a/include/linux/hugetlb.h
> +++ b/include/linux/hugetlb.h
> @@ -676,6 +676,9 @@ struct hstate {
>  	unsigned int free_huge_pages_node[MAX_NUMNODES];
>  	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
>  	char name[HSTATE_NAME_LEN];
> +#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
> +	struct page *vmemmap_tail;
> +#endif
>  };
>  
>  struct cma;
> diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
> index f5ee499b8563..2543bdbcae20 100644
> --- a/mm/hugetlb_vmemmap.c
> +++ b/mm/hugetlb_vmemmap.c
> @@ -18,6 +18,7 @@
>  #include <asm/pgalloc.h>
>  #include <asm/tlbflush.h>
>  #include "hugetlb_vmemmap.h"
> +#include "internal.h"
>  
>  /**
>   * struct vmemmap_remap_walk - walk vmemmap page table
> @@ -518,7 +519,24 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *
>  	return true;
>  }
>  
> -static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
> +static void hugetlb_vmemmap_tail_alloc(struct hstate *h)
> +{
> +	struct page *p;
> +
> +	if (h->vmemmap_tail)
> +		return;
> +

The above check is unnecessary as we already check for !h->vmemmap_tail in __hugetlb_vmemmap_optimize_folio?

Is it possible that we could have a race here? Where 2 threads both trying to allocate a hugetlb page when none
exist in the system, both see h->vmemmap_tail == NULL, both call alloc_page and set h->vmemmap_tail?

Also, is there a good point where we can see that the number of hstate->nr_huge_pages has gone down to 0 and free
h->vmemmap_tail? It's a single page per hstate so not a big deal, but would be nice to have cleanup for it?

> +	h->vmemmap_tail = alloc_page(GFP_KERNEL | __GFP_ZERO);
> +	if (!h->vmemmap_tail)
> +		return;
> +
> +	p = page_to_virt(h->vmemmap_tail);
> +
> +	for (int i = 0; i < PAGE_SIZE / sizeof(struct page); i++)
> +		prep_compound_tail(p + i, p, huge_page_order(h));
> +}
> +
> +static int __hugetlb_vmemmap_optimize_folio(struct hstate *h,
>  					    struct folio *folio,
>  					    struct list_head *vmemmap_pages,
>  					    unsigned long flags)
> @@ -533,6 +551,11 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
>  	if (!vmemmap_should_optimize_folio(h, folio))
>  		return ret;
>  
> +	if (!h->vmemmap_tail)
> +		hugetlb_vmemmap_tail_alloc(h);
> +	if (!h->vmemmap_tail)
> +		return -ENOMEM;
> +
>  	static_branch_inc(&hugetlb_optimize_vmemmap_key);
>  
>  	if (flags & VMEMMAP_SYNCHRONIZE_RCU)
> @@ -562,7 +585,7 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
>  	list_add(&vmemmap_head->lru, vmemmap_pages);
>  	memmap_pages_add(1);
>  
> -	vmemmap_tail	= vmemmap_head;
> +	vmemmap_tail	= h->vmemmap_tail;
>  	vmemmap_start	= (unsigned long)folio;
>  	vmemmap_end	= vmemmap_start + hugetlb_vmemmap_size(h);
>  
> @@ -594,7 +617,7 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
>   * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
>   * vmemmap pages have been optimized.
>   */
> -void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
> +void hugetlb_vmemmap_optimize_folio(struct hstate *h, struct folio *folio)
>  {
>  	LIST_HEAD(vmemmap_pages);
>  
> @@ -868,7 +891,7 @@ static const struct ctl_table hugetlb_vmemmap_sysctls[] = {
>  
>  static int __init hugetlb_vmemmap_init(void)
>  {
> -	const struct hstate *h;
> +	struct hstate *h;
>  
>  	/* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
>  	BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
> diff --git a/mm/hugetlb_vmemmap.h b/mm/hugetlb_vmemmap.h
> index 18b490825215..f44e40c44a21 100644
> --- a/mm/hugetlb_vmemmap.h
> +++ b/mm/hugetlb_vmemmap.h
> @@ -24,7 +24,7 @@ int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio);
>  long hugetlb_vmemmap_restore_folios(const struct hstate *h,
>  					struct list_head *folio_list,
>  					struct list_head *non_hvo_folios);
> -void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio);
> +void hugetlb_vmemmap_optimize_folio(struct hstate *h, struct folio *folio);
>  void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list);
>  void hugetlb_vmemmap_optimize_bootmem_folios(struct hstate *h, struct list_head *folio_list);
>  #ifdef CONFIG_SPARSEMEM_VMEMMAP_PREINIT
> @@ -64,7 +64,7 @@ static inline long hugetlb_vmemmap_restore_folios(const struct hstate *h,
>  	return 0;
>  }
>  
> -static inline void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
> +static inline void hugetlb_vmemmap_optimize_folio(struct hstate *h, struct folio *folio)
>  {
>  }
>
Re: [PATCH 06/11] mm/hugetlb: Remove fake head pages
Posted by Kiryl Shutsemau 1 week ago
On Sat, Dec 06, 2025 at 05:03:25PM +0000, Usama Arif wrote:
> > @@ -518,7 +519,24 @@ static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *
> >  	return true;
> >  }
> >  
> > -static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
> > +static void hugetlb_vmemmap_tail_alloc(struct hstate *h)
> > +{
> > +	struct page *p;
> > +
> > +	if (h->vmemmap_tail)
> > +		return;
> > +
> 
> The above check is unnecessary as we already check for
> !h->vmemmap_tail in __hugetlb_vmemmap_optimize_folio?

Right. I will streamline this codepath.

> Is it possible that we could have a race here? Where 2 threads both
> trying to allocate a hugetlb page when none exist in the system, both
> see h->vmemmap_tail == NULL, both call alloc_page and set
> h->vmemmap_tail?

Good catch. Will fix. I guess, serializing h->vmemmap_tail with
hugetlb_lock should be good enough.

> Also, is there a good point where we can see that the number of
> hstate->nr_huge_pages has gone down to 0 and free
> h->vmemmap_tail? It's a single page per hstate so not a big deal, but
> would be nice to have cleanup for it?

I didn't want to go to this complexity, but if you folks think that it
is needed, sure, can do.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov