This propagates the hugepage size from the memblock APIs
(memblock_alloc_try_nid_raw and memblock_alloc_range_nid)
so that it can be stored in struct memblock_region. This does not
introduce any functional change, and hugepage_size is not used in
this commit; it is just setup for the next commit, where hugepage_size
is used to skip initialization of struct pages that will be freed later
when HVO (HugeTLB Vmemmap Optimization) is enabled.
Signed-off-by: Usama Arif <usama.arif@bytedance.com>
---
arch/arm64/mm/kasan_init.c | 2 +-
arch/powerpc/platforms/pasemi/iommu.c | 2 +-
arch/powerpc/platforms/pseries/setup.c | 4 +-
arch/powerpc/sysdev/dart_iommu.c | 2 +-
include/linux/memblock.h | 8 ++-
mm/cma.c | 4 +-
mm/hugetlb.c | 6 +-
mm/memblock.c | 60 ++++++++++++--------
mm/mm_init.c | 2 +-
mm/sparse-vmemmap.c | 2 +-
tools/testing/memblock/tests/alloc_nid_api.c | 2 +-
11 files changed, 56 insertions(+), 38 deletions(-)
diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index f17d066e85eb..39992a418891 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -50,7 +50,7 @@ static phys_addr_t __init kasan_alloc_raw_page(int node)
void *p = memblock_alloc_try_nid_raw(PAGE_SIZE, PAGE_SIZE,
__pa(MAX_DMA_ADDRESS),
MEMBLOCK_ALLOC_NOLEAKTRACE,
- node);
+ node, 0);
if (!p)
panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%llx\n",
__func__, PAGE_SIZE, PAGE_SIZE, node,
diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c
index 375487cba874..6963cdf76bce 100644
--- a/arch/powerpc/platforms/pasemi/iommu.c
+++ b/arch/powerpc/platforms/pasemi/iommu.c
@@ -201,7 +201,7 @@ static int __init iob_init(struct device_node *dn)
/* For 2G space, 8x64 pages (2^21 bytes) is max total l2 size */
iob_l2_base = memblock_alloc_try_nid_raw(1UL << 21, 1UL << 21,
MEMBLOCK_LOW_LIMIT, 0x80000000,
- NUMA_NO_NODE);
+ NUMA_NO_NODE, 0);
if (!iob_l2_base)
panic("%s: Failed to allocate %lu bytes align=0x%lx max_addr=%x\n",
__func__, 1UL << 21, 1UL << 21, 0x80000000);
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index e2a57cfa6c83..cec7198b59d2 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -160,7 +160,7 @@ static void __init fwnmi_init(void)
*/
mce_data_buf = memblock_alloc_try_nid_raw(RTAS_ERROR_LOG_MAX * nr_cpus,
RTAS_ERROR_LOG_MAX, MEMBLOCK_LOW_LIMIT,
- ppc64_rma_size, NUMA_NO_NODE);
+ ppc64_rma_size, NUMA_NO_NODE, 0);
if (!mce_data_buf)
panic("Failed to allocate %d bytes below %pa for MCE buffer\n",
RTAS_ERROR_LOG_MAX * nr_cpus, &ppc64_rma_size);
@@ -176,7 +176,7 @@ static void __init fwnmi_init(void)
size = sizeof(struct slb_entry) * mmu_slb_size * nr_cpus;
slb_ptr = memblock_alloc_try_nid_raw(size,
sizeof(struct slb_entry), MEMBLOCK_LOW_LIMIT,
- ppc64_rma_size, NUMA_NO_NODE);
+ ppc64_rma_size, NUMA_NO_NODE, 0);
if (!slb_ptr)
panic("Failed to allocate %zu bytes below %pa for slb area\n",
size, &ppc64_rma_size);
diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c
index 98096bbfd62e..86c676b61899 100644
--- a/arch/powerpc/sysdev/dart_iommu.c
+++ b/arch/powerpc/sysdev/dart_iommu.c
@@ -239,7 +239,7 @@ static void __init allocate_dart(void)
*/
dart_tablebase = memblock_alloc_try_nid_raw(SZ_16M, SZ_16M,
MEMBLOCK_LOW_LIMIT, SZ_2G,
- NUMA_NO_NODE);
+ NUMA_NO_NODE, 0);
if (!dart_tablebase)
panic("Failed to allocate 16MB below 2GB for DART table\n");
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index f71ff9f0ec81..bb8019540d73 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -63,6 +63,7 @@ struct memblock_region {
#ifdef CONFIG_NUMA
int nid;
#endif
+ phys_addr_t hugepage_size;
};
/**
@@ -400,7 +401,8 @@ phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,
phys_addr_t start, phys_addr_t end);
phys_addr_t memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid, bool exact_nid);
+ phys_addr_t end, int nid, bool exact_nid,
+ phys_addr_t hugepage_size);
phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
static __always_inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
@@ -415,7 +417,7 @@ void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align,
int nid);
void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
- int nid);
+ int nid, phys_addr_t hugepage_size);
void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
int nid);
@@ -431,7 +433,7 @@ static inline void *memblock_alloc_raw(phys_addr_t size,
{
return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
MEMBLOCK_ALLOC_ACCESSIBLE,
- NUMA_NO_NODE);
+ NUMA_NO_NODE, 0);
}
static inline void *memblock_alloc_from(phys_addr_t size,
diff --git a/mm/cma.c b/mm/cma.c
index a4cfe995e11e..a270905aa7f2 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -334,7 +334,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (!memblock_bottom_up() && memblock_end >= SZ_4G + size) {
memblock_set_bottom_up(true);
addr = memblock_alloc_range_nid(size, alignment, SZ_4G,
- limit, nid, true);
+ limit, nid, true, 0);
memblock_set_bottom_up(false);
}
#endif
@@ -353,7 +353,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
if (!addr) {
addr = memblock_alloc_range_nid(size, alignment, base,
- limit, nid, true);
+ limit, nid, true, 0);
if (!addr) {
ret = -ENOMEM;
goto err;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 24352abbb9e5..5ba7fd702458 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3168,7 +3168,8 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
/* do node specific alloc */
if (nid != NUMA_NO_NODE) {
m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
- 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, nid,
+ hugetlb_vmemmap_optimizable(h) ? huge_page_size(h) : 0);
if (!m)
return 0;
goto found;
@@ -3177,7 +3178,8 @@ int __alloc_bootmem_huge_page(struct hstate *h, int nid)
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
m = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
- 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, node,
+ hugetlb_vmemmap_optimizable(h) ? huge_page_size(h) : 0);
/*
* Use the beginning of the huge page to store the
* huge_bootmem_page struct (until gather_bootmem
diff --git a/mm/memblock.c b/mm/memblock.c
index f9e61e565a53..e92d437bcb51 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -549,7 +549,8 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
int idx, phys_addr_t base,
phys_addr_t size,
int nid,
- enum memblock_flags flags)
+ enum memblock_flags flags,
+ phys_addr_t hugepage_size)
{
struct memblock_region *rgn = &type->regions[idx];
@@ -558,6 +559,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
rgn->base = base;
rgn->size = size;
rgn->flags = flags;
+ rgn->hugepage_size = hugepage_size;
memblock_set_region_node(rgn, nid);
type->cnt++;
type->total_size += size;
@@ -581,7 +583,7 @@ static void __init_memblock memblock_insert_region(struct memblock_type *type,
*/
static int __init_memblock memblock_add_range(struct memblock_type *type,
phys_addr_t base, phys_addr_t size,
- int nid, enum memblock_flags flags)
+ int nid, enum memblock_flags flags, phys_addr_t hugepage_size)
{
bool insert = false;
phys_addr_t obase = base;
@@ -598,6 +600,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
type->regions[0].base = base;
type->regions[0].size = size;
type->regions[0].flags = flags;
+ type->regions[0].hugepage_size = hugepage_size;
memblock_set_region_node(&type->regions[0], nid);
type->total_size = size;
return 0;
@@ -646,7 +649,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
end_rgn = idx + 1;
memblock_insert_region(type, idx++, base,
rbase - base, nid,
- flags);
+ flags, hugepage_size);
}
}
/* area below @rend is dealt with, forget about it */
@@ -661,7 +664,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
start_rgn = idx;
end_rgn = idx + 1;
memblock_insert_region(type, idx, base, end - base,
- nid, flags);
+ nid, flags, hugepage_size);
}
}
@@ -705,7 +708,7 @@ int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
&base, &end, nid, flags, (void *)_RET_IP_);
- return memblock_add_range(&memblock.memory, base, size, nid, flags);
+ return memblock_add_range(&memblock.memory, base, size, nid, flags, 0);
}
/**
@@ -726,7 +729,7 @@ int __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
- return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0, 0);
}
/**
@@ -782,7 +785,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
type->total_size -= base - rbase;
memblock_insert_region(type, idx, rbase, base - rbase,
memblock_get_region_node(rgn),
- rgn->flags);
+ rgn->flags, 0);
} else if (rend > end) {
/*
* @rgn intersects from above. Split and redo the
@@ -793,7 +796,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type,
type->total_size -= end - rbase;
memblock_insert_region(type, idx--, rbase, end - rbase,
memblock_get_region_node(rgn),
- rgn->flags);
+ rgn->flags, 0);
} else {
/* @rgn is fully contained, record it */
if (!*end_rgn)
@@ -863,14 +866,20 @@ int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
return memblock_remove_range(&memblock.reserved, base, size);
}
-int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+int __init_memblock memblock_reserve_huge(phys_addr_t base, phys_addr_t size,
+ phys_addr_t hugepage_size)
{
phys_addr_t end = base + size - 1;
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
- return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0, hugepage_size);
+}
+
+int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_reserve_huge(base, size, 0);
}
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -881,7 +890,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
- return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0, 0);
}
#endif
@@ -1365,6 +1374,7 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
* @end: the upper bound of the memory region to allocate (phys address)
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
* @exact_nid: control the allocation fall back to other nodes
+ * @hugepage_size: size of the hugepages in bytes
*
* The allocation is performed from memory region limited by
* memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE.
@@ -1385,7 +1395,7 @@ __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid,
- bool exact_nid)
+ bool exact_nid, phys_addr_t hugepage_size)
{
enum memblock_flags flags = choose_memblock_flags();
phys_addr_t found;
@@ -1402,14 +1412,14 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
again:
found = memblock_find_in_range_node(size, align, start, end, nid,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !memblock_reserve_huge(found, size, hugepage_size))
goto done;
if (nid != NUMA_NO_NODE && !exact_nid) {
found = memblock_find_in_range_node(size, align, start,
end, NUMA_NO_NODE,
flags);
- if (found && !memblock_reserve(found, size))
+ if (found && !memblock_reserve_huge(found, size, hugepage_size))
goto done;
}
@@ -1469,7 +1479,7 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
__func__, (u64)size, (u64)align, &start, &end,
(void *)_RET_IP_);
return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
- false);
+ false, 0);
}
/**
@@ -1488,7 +1498,7 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
{
return memblock_alloc_range_nid(size, align, 0,
- MEMBLOCK_ALLOC_ACCESSIBLE, nid, false);
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid, false, 0);
}
/**
@@ -1514,7 +1524,7 @@ phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t ali
static void * __init memblock_alloc_internal(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
- int nid, bool exact_nid)
+ int nid, bool exact_nid, phys_addr_t hugepage_size)
{
phys_addr_t alloc;
@@ -1530,12 +1540,12 @@ static void * __init memblock_alloc_internal(
max_addr = memblock.current_limit;
alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid,
- exact_nid);
+ exact_nid, hugepage_size);
/* retry allocation without lower limit */
if (!alloc && min_addr)
alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid,
- exact_nid);
+ exact_nid, hugepage_size);
if (!alloc)
return NULL;
@@ -1571,7 +1581,7 @@ void * __init memblock_alloc_exact_nid_raw(
&max_addr, (void *)_RET_IP_);
return memblock_alloc_internal(size, align, min_addr, max_addr, nid,
- true);
+ true, 0);
}
/**
@@ -1585,25 +1595,29 @@ void * __init memblock_alloc_exact_nid_raw(
* is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
* allocate only from memory limited by memblock.current_limit value
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ * @hugepage_size: size of the hugepages in bytes
*
* Public function, provides additional debug information (including caller
* info), if enabled. Does not zero allocated memory, does not panic if request
* cannot be satisfied.
*
+ * If hugepage_size is not 0 and HVO is enabled, then only the struct pages
+ * that are not freed by HVO are initialized using the hugepage_size parameter.
+ *
* Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
void * __init memblock_alloc_try_nid_raw(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
- int nid)
+ int nid, phys_addr_t hugepage_size)
{
memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
return memblock_alloc_internal(size, align, min_addr, max_addr, nid,
- false);
+ false, hugepage_size);
}
/**
@@ -1634,7 +1648,7 @@ void * __init memblock_alloc_try_nid(
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
ptr = memblock_alloc_internal(size, align,
- min_addr, max_addr, nid, false);
+ min_addr, max_addr, nid, false, 0);
if (ptr)
memset(ptr, 0, size);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index a1963c3322af..c36d768bb671 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1615,7 +1615,7 @@ void __init *memmap_alloc(phys_addr_t size, phys_addr_t align,
else
ptr = memblock_alloc_try_nid_raw(size, align, min_addr,
MEMBLOCK_ALLOC_ACCESSIBLE,
- nid);
+ nid, 0);
if (ptr && size > 0)
page_init_poison(ptr, size);
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a044a130405b..56b8b8e684df 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -43,7 +43,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node,
unsigned long goal)
{
return memblock_alloc_try_nid_raw(size, align, goal,
- MEMBLOCK_ALLOC_ACCESSIBLE, node);
+ MEMBLOCK_ALLOC_ACCESSIBLE, node, 0);
}
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
diff --git a/tools/testing/memblock/tests/alloc_nid_api.c b/tools/testing/memblock/tests/alloc_nid_api.c
index 49bb416d34ff..225044366fbb 100644
--- a/tools/testing/memblock/tests/alloc_nid_api.c
+++ b/tools/testing/memblock/tests/alloc_nid_api.c
@@ -43,7 +43,7 @@ static inline void *run_memblock_alloc_nid(phys_addr_t size,
max_addr, nid);
if (alloc_nid_test_flags & TEST_F_RAW)
return memblock_alloc_try_nid_raw(size, align, min_addr,
- max_addr, nid);
+ max_addr, nid, 0);
return memblock_alloc_try_nid(size, align, min_addr, max_addr, nid);
}
--
2.25.1
On Mon, Jul 24, 2023 at 02:46:42PM +0100, Usama Arif wrote:
> This propagates the hugepage size from the memblock APIs
> (memblock_alloc_try_nid_raw and memblock_alloc_range_nid)
> so that it can be stored in struct memblock_region. This does not
> introduce any functional change, and hugepage_size is not used in
> this commit; it is just setup for the next commit, where hugepage_size
> is used to skip initialization of struct pages that will be freed later
> when HVO (HugeTLB Vmemmap Optimization) is enabled.
>
> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
> ---
> arch/arm64/mm/kasan_init.c | 2 +-
> arch/powerpc/platforms/pasemi/iommu.c | 2 +-
> arch/powerpc/platforms/pseries/setup.c | 4 +-
> arch/powerpc/sysdev/dart_iommu.c | 2 +-
> include/linux/memblock.h | 8 ++-
> mm/cma.c | 4 +-
> mm/hugetlb.c | 6 +-
> mm/memblock.c | 60 ++++++++++++--------
> mm/mm_init.c | 2 +-
> mm/sparse-vmemmap.c | 2 +-
> tools/testing/memblock/tests/alloc_nid_api.c | 2 +-
> 11 files changed, 56 insertions(+), 38 deletions(-)
>
[ snip ]
> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> index f71ff9f0ec81..bb8019540d73 100644
> --- a/include/linux/memblock.h
> +++ b/include/linux/memblock.h
> @@ -63,6 +63,7 @@ struct memblock_region {
> #ifdef CONFIG_NUMA
> int nid;
> #endif
> + phys_addr_t hugepage_size;
> };
>
> /**
> @@ -400,7 +401,8 @@ phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,
> phys_addr_t start, phys_addr_t end);
> phys_addr_t memblock_alloc_range_nid(phys_addr_t size,
> phys_addr_t align, phys_addr_t start,
> - phys_addr_t end, int nid, bool exact_nid);
> + phys_addr_t end, int nid, bool exact_nid,
> + phys_addr_t hugepage_size);
Rather than adding yet another parameter to memblock_phys_alloc_range() we
can have an API that sets a flag on the reserved regions.
With this the hugetlb reservation code can set a flag when HVO is
enabled and memmap_init_reserved_pages() will skip regions with this flag
set.
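Roughly like this, inside memmap_init_reserved_pages() (untested sketch;
the flag name here is a strawman):

	struct memblock_region *region;

	for_each_reserved_mem_region(region) {
		if (region->flags & MEMBLOCK_NOINIT)
			continue;
		/* ... existing struct page init for this region ... */
	}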
> phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
>
> static __always_inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
> @@ -415,7 +417,7 @@ void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align,
> int nid);
> void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
> phys_addr_t min_addr, phys_addr_t max_addr,
> - int nid);
> + int nid, phys_addr_t hugepage_size);
> void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
> phys_addr_t min_addr, phys_addr_t max_addr,
> int nid);
> @@ -431,7 +433,7 @@ static inline void *memblock_alloc_raw(phys_addr_t size,
> {
> return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
> MEMBLOCK_ALLOC_ACCESSIBLE,
> - NUMA_NO_NODE);
> + NUMA_NO_NODE, 0);
> }
>
> static inline void *memblock_alloc_from(phys_addr_t size,
--
Sincerely yours,
Mike.
On 26/07/2023 12:01, Mike Rapoport wrote:
> On Mon, Jul 24, 2023 at 02:46:42PM +0100, Usama Arif wrote:
>> This propagates the hugepage size from the memblock APIs
>> (memblock_alloc_try_nid_raw and memblock_alloc_range_nid)
>> so that it can be stored in struct memblock_region. This does not
>> introduce any functional change, and hugepage_size is not used in
>> this commit; it is just setup for the next commit, where hugepage_size
>> is used to skip initialization of struct pages that will be freed later
>> when HVO (HugeTLB Vmemmap Optimization) is enabled.
>>
>> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
>> ---
>> arch/arm64/mm/kasan_init.c | 2 +-
>> arch/powerpc/platforms/pasemi/iommu.c | 2 +-
>> arch/powerpc/platforms/pseries/setup.c | 4 +-
>> arch/powerpc/sysdev/dart_iommu.c | 2 +-
>> include/linux/memblock.h | 8 ++-
>> mm/cma.c | 4 +-
>> mm/hugetlb.c | 6 +-
>> mm/memblock.c | 60 ++++++++++++--------
>> mm/mm_init.c | 2 +-
>> mm/sparse-vmemmap.c | 2 +-
>> tools/testing/memblock/tests/alloc_nid_api.c | 2 +-
>> 11 files changed, 56 insertions(+), 38 deletions(-)
>>
>
> [ snip ]
>
>> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
>> index f71ff9f0ec81..bb8019540d73 100644
>> --- a/include/linux/memblock.h
>> +++ b/include/linux/memblock.h
>> @@ -63,6 +63,7 @@ struct memblock_region {
>> #ifdef CONFIG_NUMA
>> int nid;
>> #endif
>> + phys_addr_t hugepage_size;
>> };
>>
>> /**
>> @@ -400,7 +401,8 @@ phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,
>> phys_addr_t start, phys_addr_t end);
>> phys_addr_t memblock_alloc_range_nid(phys_addr_t size,
>> phys_addr_t align, phys_addr_t start,
>> - phys_addr_t end, int nid, bool exact_nid);
>> + phys_addr_t end, int nid, bool exact_nid,
>> + phys_addr_t hugepage_size);
>
> Rather than adding yet another parameter to memblock_phys_alloc_range() we
> can have an API that sets a flag on the reserved regions.
> With this the hugetlb reservation code can set a flag when HVO is
> enabled and memmap_init_reserved_pages() will skip regions with this flag
> set.
>
Hi,
Thanks for the review.
I think you meant memblock_alloc_range_nid/memblock_alloc_try_nid_raw
and not memblock_phys_alloc_range?
My initial approach was to use flags, but I think it looks worse than
what I have done in this RFC. I have pushed the flags prototype at
https://github.com/uarif1/linux/commits/flags_skip_prep_init_gigantic_HVO
(top 4 commits, for reference; the main differences are in patches 2 and
4 of the RFC). The major points are (the bigger issue is in patch 4):
- (RFC vs flags patch 2 comparison) In the RFC, hugepage_size is
propagated from memblock_alloc_try_nid_raw through function calls. When
using flags, the "no_init" boolean is propagated from
memblock_alloc_try_nid_raw through function calls until the region flags
are available in memblock_add_range and the new MEMBLOCK_NOINIT flag is
set. I think it's a bit more tricky to introduce a new function to set
the flag in the region AFTER the call to memblock_alloc_try_nid_raw has
finished, as the memblock_region cannot be found.
So something (the hugepage_size/flag information) still has to be
propagated through function calls, and a new argument needs to be added.
- (RFC vs flags patch 4 comparison) We can't skip initialization of the
whole region, only of the tail pages. We still need to initialize
HUGETLB_VMEMMAP_RESERVE_SIZE (PAGE_SIZE) worth of struct pages for each
gigantic page.
In the RFC, hugepage_size from patch 2 was used in the for loop in
memmap_init_reserved_pages in patch 4 to reserve
HUGETLB_VMEMMAP_RESERVE_SIZE worth of struct pages at every
hugepage_size step. This looks very simple and not hacky.
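For reference, the loop in memmap_init_reserved_pages in patch 4 looks
roughly like this (abridged here, see the RFC for the exact code;
HUGETLB_VMEMMAP_RESERVE_SIZE is the one page of vmemmap HVO keeps per
huge page):

	for_each_reserved_mem_region(region) {
		int nid = memblock_get_region_node(region);
		phys_addr_t start = region->base;
		phys_addr_t end = start + region->size;

		if (!region->hugepage_size) {
			reserve_bootmem_region(start, end, nid);
			continue;
		}
		/* init only the struct pages covered by the kept vmemmap */
		for (; start < end; start += region->hugepage_size)
			reserve_bootmem_region(start,
				start + HUGETLB_VMEMMAP_RESERVE_SIZE /
					sizeof(struct page) * PAGE_SIZE,
				nid);
	}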
If we use a flag, there are 2 ways to initialize the
HUGETLB_VMEMMAP_RESERVE_SIZE struct pages per hugepage:
1. (implemented in patch 4 at the GitHub link) memmap_init_reserved_pages
skips the region during initialization, as you suggested, and then we
initialize HUGETLB_VMEMMAP_RESERVE_SIZE struct pages per hugepage
somewhere later (I did it in gather_bootmem_prealloc). When calling
reserve_bootmem_region in gather_bootmem_prealloc, we need to skip
early_page_uninitialised, and this makes it look a bit hacky.
2. We initialize the HUGETLB_VMEMMAP_RESERVE_SIZE struct pages per
hugepage in memmap_init_reserved_pages itself. As we have used a flag
and haven't passed hugepage_size, we need to get the gigantic page size
somehow. There doesn't seem to be a nice way to determine the gigantic
page size in that function, as it is architecture-dependent. I think the
gigantic page size can be given by PAGE_SIZE << (PUD_SHIFT -
PAGE_SHIFT), but I'm not sure whether this is OK for all architectures.
If we can use PAGE_SIZE << (PUD_SHIFT - PAGE_SHIFT), it will look much
better than point 1.
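(For what it's worth, on x86-64 and on arm64 with 4K base pages
PUD_SHIFT is 30, so PAGE_SIZE << (PUD_SHIFT - PAGE_SHIFT) = 4K << 18 =
1G, which is indeed the gigantic page size there. I am less sure this
holds for every architecture and page-size combination.)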
Both the RFC patches and the GitHub flags implementation work, but I
think the RFC patches look much cleaner. If there is a strong preference
for the GitHub patches, should I send them to the mailing list?
Thanks,
Usama
>> phys_addr_t memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid);
>>
>> static __always_inline phys_addr_t memblock_phys_alloc(phys_addr_t size,
>> @@ -415,7 +417,7 @@ void *memblock_alloc_exact_nid_raw(phys_addr_t size, phys_addr_t align,
>> int nid);
>> void *memblock_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align,
>> phys_addr_t min_addr, phys_addr_t max_addr,
>> - int nid);
>> + int nid, phys_addr_t hugepage_size);
>> void *memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align,
>> phys_addr_t min_addr, phys_addr_t max_addr,
>> int nid);
>> @@ -431,7 +433,7 @@ static inline void *memblock_alloc_raw(phys_addr_t size,
>> {
>> return memblock_alloc_try_nid_raw(size, align, MEMBLOCK_LOW_LIMIT,
>> MEMBLOCK_ALLOC_ACCESSIBLE,
>> - NUMA_NO_NODE);
>> + NUMA_NO_NODE, 0);
>> }
>>
>> static inline void *memblock_alloc_from(phys_addr_t size,
>
On Wed, Jul 26, 2023 at 04:02:21PM +0100, Usama Arif wrote:
>
> On 26/07/2023 12:01, Mike Rapoport wrote:
> > On Mon, Jul 24, 2023 at 02:46:42PM +0100, Usama Arif wrote:
> > > This propagates the hugepage size from the memblock APIs
> > > (memblock_alloc_try_nid_raw and memblock_alloc_range_nid)
> > > so that it can be stored in struct memblock_region. This does not
> > > introduce any functional change, and hugepage_size is not used in
> > > this commit; it is just setup for the next commit, where hugepage_size
> > > is used to skip initialization of struct pages that will be freed later
> > > when HVO (HugeTLB Vmemmap Optimization) is enabled.
> > >
> > > Signed-off-by: Usama Arif <usama.arif@bytedance.com>
> > > ---
> > > arch/arm64/mm/kasan_init.c | 2 +-
> > > arch/powerpc/platforms/pasemi/iommu.c | 2 +-
> > > arch/powerpc/platforms/pseries/setup.c | 4 +-
> > > arch/powerpc/sysdev/dart_iommu.c | 2 +-
> > > include/linux/memblock.h | 8 ++-
> > > mm/cma.c | 4 +-
> > > mm/hugetlb.c | 6 +-
> > > mm/memblock.c | 60 ++++++++++++--------
> > > mm/mm_init.c | 2 +-
> > > mm/sparse-vmemmap.c | 2 +-
> > > tools/testing/memblock/tests/alloc_nid_api.c | 2 +-
> > > 11 files changed, 56 insertions(+), 38 deletions(-)
> > >
> >
> > [ snip ]
> >
> > > diff --git a/include/linux/memblock.h b/include/linux/memblock.h
> > > index f71ff9f0ec81..bb8019540d73 100644
> > > --- a/include/linux/memblock.h
> > > +++ b/include/linux/memblock.h
> > > @@ -63,6 +63,7 @@ struct memblock_region {
> > > #ifdef CONFIG_NUMA
> > > int nid;
> > > #endif
> > > + phys_addr_t hugepage_size;
> > > };
> > > /**
> > > @@ -400,7 +401,8 @@ phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,
> > > phys_addr_t start, phys_addr_t end);
> > > phys_addr_t memblock_alloc_range_nid(phys_addr_t size,
> > > phys_addr_t align, phys_addr_t start,
> > > - phys_addr_t end, int nid, bool exact_nid);
> > > + phys_addr_t end, int nid, bool exact_nid,
> > > + phys_addr_t hugepage_size);
> >
> > Rather than adding yet another parameter to memblock_phys_alloc_range() we
> > can have an API that sets a flag on the reserved regions.
> > With this the hugetlb reservation code can set a flag when HVO is
> > enabled and memmap_init_reserved_pages() will skip regions with this flag
> > set.
> >
>
> Hi,
>
> Thanks for the review.
>
> I think you meant memblock_alloc_range_nid/memblock_alloc_try_nid_raw and
> not memblock_phys_alloc_range?
Yes.
> My initial approach was to use flags, but I think it looks worse than
> what I have done in this RFC. I have pushed the flags prototype at
> https://github.com/uarif1/linux/commits/flags_skip_prep_init_gigantic_HVO
> (top 4 commits, for reference; the main differences are in patches 2 and
> 4 of the RFC). The major points are (the bigger issue is in patch 4):
>
> - (RFC vs flags patch 2 comparison) In the RFC, hugepage_size is
> propagated from memblock_alloc_try_nid_raw through function calls. When
> using flags, the "no_init" boolean is propagated from
> memblock_alloc_try_nid_raw through function calls until the region flags
> are available in memblock_add_range and the new MEMBLOCK_NOINIT flag is
> set. I think it's a bit more tricky to introduce a new function to set
> the flag in the region AFTER the call to memblock_alloc_try_nid_raw has
> finished, as the memblock_region cannot be found.
> So something (the hugepage_size/flag information) still has to be
> propagated through function calls, and a new argument needs to be added.
Sorry if I wasn't clear. I didn't mean to add a flags parameter; I meant
to add a flag and a function that sets this flag for a range. So for
MEMBLOCK_NOINIT there would be
int memblock_mark_noinit(phys_addr_t base, phys_addr_t size);
I'd just name this flag MEMBLOCK_RSRV_NOINIT to make it clear it controls
the reserved regions.
This won't require updating all call sites of memblock_alloc_range_nid()
and memblock_alloc_try_nid_raw() but only a small refactoring of
memblock_setclr_flag() and its callers.
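E.g. (untested; this assumes memblock_setclr_flag() is refactored to
take the memblock_type it operates on, instead of hardcoding
memblock.memory as it does today):

	/* new bit in enum memblock_flags, e.g. MEMBLOCK_RSRV_NOINIT = 0x10 */

	int __init_memblock memblock_mark_noinit(phys_addr_t base, phys_addr_t size)
	{
		return memblock_setclr_flag(&memblock.reserved, base, size,
					    1, MEMBLOCK_RSRV_NOINIT);
	}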
> - (RFC vs flags patch 4 comparison) We can't skip initialization of the
> whole region, only of the tail pages. We still need to initialize
> HUGETLB_VMEMMAP_RESERVE_SIZE (PAGE_SIZE) worth of struct pages for each
> gigantic page.
> In the RFC, hugepage_size from patch 2 was used in the for loop in
> memmap_init_reserved_pages in patch 4 to reserve
> HUGETLB_VMEMMAP_RESERVE_SIZE worth of struct pages at every
> hugepage_size step. This looks very simple and not hacky.
But this requires having hugetlb details in memblock, which feels
backwards to me.
With memblock_mark_noinit() you can decide what parts of a gigantic page
should be initialized in __alloc_bootmem_huge_page() and mark only the
relevant range as NOINIT.
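Something along these lines, after a successful allocation there
(untested; the first PAGE_SIZE keeps its struct pages, since the
beginning of the huge page is used to store struct huge_bootmem_page):

	m = memblock_alloc_try_nid_raw(huge_page_size(h), huge_page_size(h),
				       0, MEMBLOCK_ALLOC_ACCESSIBLE, nid);
	if (m && hugetlb_vmemmap_optimizable(h))
		memblock_mark_noinit(virt_to_phys((void *)m + PAGE_SIZE),
				     huge_page_size(h) - PAGE_SIZE);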
> If we use a flag, there are 2 ways to initialize the
> HUGETLB_VMEMMAP_RESERVE_SIZE struct pages per hugepage:
>
> 1. (implemented in patch 4 at the GitHub link) memmap_init_reserved_pages
> skips the region during initialization, as you suggested, and then we
> initialize HUGETLB_VMEMMAP_RESERVE_SIZE struct pages per hugepage
> somewhere later (I did it in gather_bootmem_prealloc). When calling
> reserve_bootmem_region in gather_bootmem_prealloc, we need to skip
> early_page_uninitialised, and this makes it look a bit hacky.
>
> 2. We initialize the HUGETLB_VMEMMAP_RESERVE_SIZE struct pages per
> hugepage in memmap_init_reserved_pages itself. As we have used a flag
> and haven't passed hugepage_size, we need to get the gigantic page size
> somehow. There doesn't seem to be a nice way to determine the gigantic
> page size in that function, as it is architecture-dependent. I think the
> gigantic page size can be given by PAGE_SIZE << (PUD_SHIFT - PAGE_SHIFT),
> but I'm not sure whether this is OK for all architectures. If we can use
> PAGE_SIZE << (PUD_SHIFT - PAGE_SHIFT), it will look much better than point 1.
>
> Both the RFC patches and the GitHub flags implementation work, but I
> think the RFC patches look much cleaner. If there is a strong preference
> for the GitHub patches, should I send them to the mailing list?
>
> Thanks,
> Usama
--
Sincerely yours,
Mike.
On 27/07/2023 05:30, Mike Rapoport wrote:
> On Wed, Jul 26, 2023 at 04:02:21PM +0100, Usama Arif wrote:
>>
>> On 26/07/2023 12:01, Mike Rapoport wrote:
>>> On Mon, Jul 24, 2023 at 02:46:42PM +0100, Usama Arif wrote:
>>>> This propagates the hugepage size from the memblock APIs
>>>> (memblock_alloc_try_nid_raw and memblock_alloc_range_nid)
>>>> so that it can be stored in struct memblock_region. This does not
>>>> introduce any functional change, and hugepage_size is not used in
>>>> this commit; it is just setup for the next commit, where hugepage_size
>>>> is used to skip initialization of struct pages that will be freed later
>>>> when HVO (HugeTLB Vmemmap Optimization) is enabled.
>>>>
>>>> Signed-off-by: Usama Arif <usama.arif@bytedance.com>
>>>> ---
>>>> arch/arm64/mm/kasan_init.c | 2 +-
>>>> arch/powerpc/platforms/pasemi/iommu.c | 2 +-
>>>> arch/powerpc/platforms/pseries/setup.c | 4 +-
>>>> arch/powerpc/sysdev/dart_iommu.c | 2 +-
>>>> include/linux/memblock.h | 8 ++-
>>>> mm/cma.c | 4 +-
>>>> mm/hugetlb.c | 6 +-
>>>> mm/memblock.c | 60 ++++++++++++--------
>>>> mm/mm_init.c | 2 +-
>>>> mm/sparse-vmemmap.c | 2 +-
>>>> tools/testing/memblock/tests/alloc_nid_api.c | 2 +-
>>>> 11 files changed, 56 insertions(+), 38 deletions(-)
>>>>
>>>
>>> [ snip ]
>>>
>>>> diff --git a/include/linux/memblock.h b/include/linux/memblock.h
>>>> index f71ff9f0ec81..bb8019540d73 100644
>>>> --- a/include/linux/memblock.h
>>>> +++ b/include/linux/memblock.h
>>>> @@ -63,6 +63,7 @@ struct memblock_region {
>>>> #ifdef CONFIG_NUMA
>>>> int nid;
>>>> #endif
>>>> + phys_addr_t hugepage_size;
>>>> };
>>>> /**
>>>> @@ -400,7 +401,8 @@ phys_addr_t memblock_phys_alloc_range(phys_addr_t size, phys_addr_t align,
>>>> phys_addr_t start, phys_addr_t end);
>>>> phys_addr_t memblock_alloc_range_nid(phys_addr_t size,
>>>> phys_addr_t align, phys_addr_t start,
>>>> - phys_addr_t end, int nid, bool exact_nid);
>>>> + phys_addr_t end, int nid, bool exact_nid,
>>>> + phys_addr_t hugepage_size);
>>>
>>> Rather than adding yet another parameter to memblock_phys_alloc_range() we
>>> can have an API that sets a flag on the reserved regions.
>>> With this the hugetlb reservation code can set a flag when HVO is
>>> enabled and memmap_init_reserved_pages() will skip regions with this flag
>>> set.
>>>
>>
>> Hi,
>>
>> Thanks for the review.
>>
>> I think you meant memblock_alloc_range_nid/memblock_alloc_try_nid_raw and
>> not memblock_phys_alloc_range?
>
> Yes.
>
>> My initial approach was to use flags, but I think it looks worse than
>> what I have done in this RFC. I have pushed the flags prototype at
>> https://github.com/uarif1/linux/commits/flags_skip_prep_init_gigantic_HVO
>> (top 4 commits, for reference; the main differences are in patches 2 and
>> 4 of the RFC). The major points are (the bigger issue is in patch 4):
>>
>> - (RFC vs flags patch 2 comparison) In the RFC, hugepage_size is
>> propagated from memblock_alloc_try_nid_raw through function calls. When
>> using flags, the "no_init" boolean is propagated from
>> memblock_alloc_try_nid_raw through function calls until the region flags
>> are available in memblock_add_range and the new MEMBLOCK_NOINIT flag is
>> set. I think it's a bit more tricky to introduce a new function to set
>> the flag in the region AFTER the call to memblock_alloc_try_nid_raw has
>> finished, as the memblock_region cannot be found.
>> So something (the hugepage_size/flag information) still has to be
>> propagated through function calls, and a new argument needs to be added.
>
> Sorry if I wasn't clear. I didn't mean to add a flags parameter; I meant
> to add a flag and a function that sets this flag for a range. So for
> MEMBLOCK_NOINIT there would be
>
> int memblock_mark_noinit(phys_addr_t base, phys_addr_t size);
>
> I'd just name this flag MEMBLOCK_RSRV_NOINIT to make it clear it controls
> the reserved regions.
>
> This won't require updating all call sites of memblock_alloc_range_nid()
> and memblock_alloc_try_nid_raw() but only a small refactoring of
> memblock_setclr_flag() and its callers.
>
Thanks for this, it's much cleaner doing it the way you described. I have
sent v1 implementing this:
https://lore.kernel.org/all/20230727204624.1942372-1-usama.arif@bytedance.com/.
Regards,
Usama