.
>
> Also, can't we fail lightly during the first attempt and dynamically decide if we
> should do a second pass?
>
Good idea, like below
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 753f99b..425a759 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3589,6 +3589,7 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
unsigned long jiffies_start;
unsigned long jiffies_end;
+ unsigned long remaining;
job.thread_fn = hugetlb_pages_alloc_boot_node;
job.start = 0;
@@ -3620,6 +3621,18 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
jiffies_start = jiffies;
padata_do_multithreaded(&job);
+
+ if (h->nr_huge_pages != h->max_huge_pages && hugetlb_vmemmap_optimizable(h)) {
+ remaining = h->max_huge_pages - h->nr_huge_pages;
+ /* vmemmap optimization can save about 1.6% (4/250) memory */
+ remaining = min(remaining, (h->nr_huge_pages * 4 / 250));
+
+ job.start = h->nr_huge_pages;
+ job.size = remaining;
+ job.min_chunk = remaining / hugepage_allocation_threads;
+ padata_do_multithreaded(&job);
+ }
Thanks
-Li
> --
> Cheers
>
> David / dhildenb
On 27.08.25 06:12, Li,Rongqing wrote:
> > .
>>
>> Also, can't we fail lightly during the first attempt and dynamically decide if we
>> should do a second pass?
>>
>
>
> Good idea, like below
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index 753f99b..425a759 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -3589,6 +3589,7 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
>
>  	unsigned long jiffies_start;
>  	unsigned long jiffies_end;
> +	unsigned long remaining;
>
>  	job.thread_fn = hugetlb_pages_alloc_boot_node;
>  	job.start = 0;
> @@ -3620,6 +3621,18 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
>
>  	jiffies_start = jiffies;
>  	padata_do_multithreaded(&job);
> +
> +	if (h->nr_huge_pages != h->max_huge_pages && hugetlb_vmemmap_optimizable(h)) {
> +		remaining = h->max_huge_pages - h->nr_huge_pages;
> +		/* vmemmap optimization can save about 1.6% (4/250) memory */
> +		remaining = min(remaining, (h->nr_huge_pages * 4 / 250));

I don't like hard coding that here.

> +
> +		job.start = h->nr_huge_pages;
> +		job.size = remaining;
> +		job.min_chunk = remaining / hugepage_allocation_threads;
> +		padata_do_multithreaded(&job);
> +	}

Thinking out loud, can't we try in a loop until either

a) We allocated all we need

b) We don't make any more progress

Not sure if something like the following could fly:

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1f42186a85ea4..dfb4d717b8a02 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3595,8 +3595,6 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
 	unsigned long jiffies_end;
 
 	job.thread_fn = hugetlb_pages_alloc_boot_node;
-	job.start = 0;
-	job.size = h->max_huge_pages;
 
 	/*
 	 * job.max_threads is 25% of the available cpu threads by default.
@@ -3620,10 +3618,24 @@ static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
 	}
 
 	job.max_threads = hugepage_allocation_threads;
-	job.min_chunk = h->max_huge_pages / hugepage_allocation_threads;
 
 	jiffies_start = jiffies;
-	padata_do_multithreaded(&job);
+	/* TODO: comment why we retry and how it interacts with vmemmap op. */
+	while (h->nr_huge_pages != h->max_huge_pages) {
+		unsigned long remaining = h->max_huge_pages - h->nr_huge_pages;
+
+		job.start = h->nr_huge_pages;
+		job.size = remaining;
+		job.min_chunk = remaining / hugepage_allocation_threads;
+		padata_do_multithreaded(&job);
+
+		if (hugetlb_vmemmap_optimizable(h))
+			break;
+
+		/* Stop if there is no progress. */
+		if (remaining == h->max_huge_pages - h->nr_huge_pages)
+			break;
+	}
 
 	jiffies_end = jiffies;
 	pr_info("HugeTLB: allocation took %dms with hugepage_allocation_threads=%ld\n",
--
Cheers

David / dhildenb
© 2016 - 2025 Red Hat, Inc.