When move_pfn_range_to_zone() or remove_pfn_range_from_zone() is invoked,
zone->contiguous is updated by scanning the new zone's pfn range from
beginning to end, regardless of the previous state of the old zone. When
the zone's pfn range is large, the cost of traversing that range to update
zone->contiguous can be significant.
Add fast paths to quickly determine the zone's contiguous state without
scanning the new zone span. The cases are: if the new range does not
overlap the previous zone span, the zone is not contiguous; if the new
range is adjacent to the previous span, the zone simply inherits its
previous contiguous state; if the newly added pages cannot fill the holes
of the previous zone, the zone is not contiguous.
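For illustration, the growing-side checks described above reduce to roughly
the following sketch (condensed from zone_contig_state_after_growing() in
the diff below; the helper name is only for illustration, and the shrinking
side is analogous):

static enum zone_contig_state contig_after_growing_sketch(struct zone *zone,
		unsigned long start_pfn, unsigned long nr_pages)
{
	const unsigned long end_pfn = start_pfn + nr_pages;

	if (zone_is_empty(zone))
		return ZONE_CONTIG_YES;	/* the added range spans the whole zone */
	if (end_pfn < zone->zone_start_pfn || start_pfn > zone_end_pfn(zone))
		return ZONE_CONTIG_NO;	/* a hole separates old and new span */
	if (end_pfn == zone->zone_start_pfn || start_pfn == zone_end_pfn(zone))
		return zone->contiguous ? ZONE_CONTIG_YES : ZONE_CONTIG_NO; /* adjacent */
	if (nr_pages < zone->spanned_pages - zone->present_pages)
		return ZONE_CONTIG_NO;	/* cannot fill the existing holes */
	return ZONE_CONTIG_MAYBE;	/* fall back to scanning the zone */
}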
The following memory hotplug test cases for a VM [1], run in the
environment [2], show that this optimization significantly reduces the
memory hotplug time [3].
+----------------+------+---------------+--------------+----------------+
| | Size | Time (before) | Time (after) | Time Reduction |
| +------+---------------+--------------+----------------+
| Plug Memory | 256G | 10s | 2s | 80% |
| +------+---------------+--------------+----------------+
| | 512G | 33s | 6s | 81% |
+----------------+------+---------------+--------------+----------------+
+----------------+------+---------------+--------------+----------------+
| | Size | Time (before) | Time (after) | Time Reduction |
| +------+---------------+--------------+----------------+
| Unplug Memory | 256G | 10s | 2s | 80% |
| +------+---------------+--------------+----------------+
| | 512G | 34s | 6s | 82% |
+----------------+------+---------------+--------------+----------------+
[1] Qemu commands to hotplug 256G/512G memory for a VM:
object_add memory-backend-ram,id=hotmem0,size=256G/512G,share=on
device_add virtio-mem-pci,id=vmem1,memdev=hotmem0,bus=port1
qom-set vmem1 requested-size 256G/512G (Plug Memory)
qom-set vmem1 requested-size 0G (Unplug Memory)
[2] Hardware : Intel Icelake server
Guest Kernel : v6.18-rc2
Qemu : v9.0.0
Launch VM :
qemu-system-x86_64 -accel kvm -cpu host \
-drive file=./Centos10_cloud.qcow2,format=qcow2,if=virtio \
-drive file=./seed.img,format=raw,if=virtio \
-smp 3,cores=3,threads=1,sockets=1,maxcpus=3 \
-m 2G,slots=10,maxmem=2052472M \
-device pcie-root-port,id=port1,bus=pcie.0,slot=1,multifunction=on \
-device pcie-root-port,id=port2,bus=pcie.0,slot=2 \
-nographic -machine q35 \
-nic user,hostfwd=tcp::3000-:22
Guest kernel auto-onlines newly added memory blocks:
echo online > /sys/devices/system/memory/auto_online_blocks
[3] The time from issuing the QEMU commands in [1] until the output of
'grep MemTotal /proc/meminfo' in the guest reflects that all hotplugged
memory is recognized.
Reported-by: Nanhai Zou <nanhai.zou@intel.com>
Reported-by: Chen Zhang <zhangchen.kidd@jd.com>
Tested-by: Yuan Liu <yuan1.liu@intel.com>
Reviewed-by: Tim Chen <tim.c.chen@linux.intel.com>
Reviewed-by: Qiuxu Zhuo <qiuxu.zhuo@intel.com>
Reviewed-by: Yu C Chen <yu.c.chen@intel.com>
Reviewed-by: Pan Deng <pan.deng@intel.com>
Reviewed-by: Nanhai Zou <nanhai.zou@intel.com>
Reviewed-by: Yuan Liu <yuan1.liu@intel.com>
Signed-off-by: Tianyou Li <tianyou.li@intel.com>
---
mm/internal.h | 8 ++++-
mm/memory_hotplug.c | 75 +++++++++++++++++++++++++++++++++++++++++++--
mm/mm_init.c | 12 ++++++--
3 files changed, 89 insertions(+), 6 deletions(-)
diff --git a/mm/internal.h b/mm/internal.h
index e430da900430..828aed5c2fef 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -730,7 +730,13 @@ static inline struct page *pageblock_pfn_to_page(unsigned long start_pfn,
return __pageblock_pfn_to_page(start_pfn, end_pfn, zone);
}
-void set_zone_contiguous(struct zone *zone);
+enum zone_contig_state {
+ ZONE_CONTIG_YES,
+ ZONE_CONTIG_NO,
+ ZONE_CONTIG_MAYBE,
+};
+
+void set_zone_contiguous(struct zone *zone, enum zone_contig_state state);
bool pfn_range_intersects_zones(int nid, unsigned long start_pfn,
unsigned long nr_pages);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index ce6caf8674a5..f51293be12eb 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -544,6 +544,28 @@ static void update_pgdat_span(struct pglist_data *pgdat)
pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
+static enum zone_contig_state zone_contig_state_after_shrinking(struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ const unsigned long end_pfn = start_pfn + nr_pages;
+
+ /*
+ * If the removed pfn range inside the original zone span, the contiguous
+ * property is surely false.
+ */
+ if (start_pfn > zone->zone_start_pfn && end_pfn < zone_end_pfn(zone))
+ return ZONE_CONTIG_NO;
+
+ /* If the removed pfn range is at the beginning or end of the
+ * original zone span, the contiguous property is preserved when
+ * the original zone is contiguous.
+ */
+ if (start_pfn == zone->zone_start_pfn || end_pfn == zone_end_pfn(zone))
+ return zone->contiguous ? ZONE_CONTIG_YES : ZONE_CONTIG_MAYBE;
+
+ return ZONE_CONTIG_MAYBE;
+}
+
void remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages)
@@ -551,6 +573,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
const unsigned long end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long pfn, cur_nr_pages;
+ enum zone_contig_state new_contiguous_state = ZONE_CONTIG_MAYBE;
/* Poison struct pages because they are now uninitialized again. */
for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
@@ -571,12 +594,14 @@ void remove_pfn_range_from_zone(struct zone *zone,
if (zone_is_zone_device(zone))
return;
+ new_contiguous_state = zone_contig_state_after_shrinking(zone, start_pfn,
+ nr_pages);
clear_zone_contiguous(zone);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
update_pgdat_span(pgdat);
- set_zone_contiguous(zone);
+ set_zone_contiguous(zone, new_contiguous_state);
}
/**
@@ -736,6 +761,39 @@ static inline void section_taint_zone_device(unsigned long pfn)
}
#endif
+static enum zone_contig_state zone_contig_state_after_growing(struct zone *zone,
+ unsigned long start_pfn, unsigned long nr_pages)
+{
+ const unsigned long end_pfn = start_pfn + nr_pages;
+
+ if (zone_is_empty(zone))
+ return ZONE_CONTIG_YES;
+
+ /*
+ * If the moved pfn range does not intersect with the original zone spa
+ * the contiguous property is surely false.
+ */
+ if (end_pfn < zone->zone_start_pfn || start_pfn > zone_end_pfn(zone))
+ return ZONE_CONTIG_NO;
+
+ /*
+ * If the moved pfn range is adjacent to the original zone span, given
+ * the moved pfn range's contiguous property is always true, the zone's
+ * contiguous property inherited from the original value.
+ */
+ if (end_pfn == zone->zone_start_pfn || start_pfn == zone_end_pfn(zone))
+ return zone->contiguous ? ZONE_CONTIG_YES : ZONE_CONTIG_NO;
+
+ /*
+ * If the original zone's hole larger than the moved pages in the range
+ * the contiguous property is surely false.
+ */
+ if (nr_pages < (zone->spanned_pages - zone->present_pages))
+ return ZONE_CONTIG_NO;
+
+ return ZONE_CONTIG_MAYBE;
+}
+
/*
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
@@ -1090,11 +1148,20 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
{
unsigned long end_pfn = pfn + nr_pages;
int ret, i;
+ enum zone_contig_state new_contiguous_state = ZONE_CONTIG_NO;
ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
if (ret)
return ret;
+ /*
+ * If the allocated memmap pages are not in a full section, keep the
+ * contiguous state as ZONE_CONTIG_NO.
+ */
+ if (IS_ALIGNED(end_pfn, PAGES_PER_SECTION))
+ new_contiguous_state = zone_contig_state_after_growing(zone,
+ pfn, nr_pages);
+
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
false);
@@ -1113,7 +1180,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (nr_pages >= PAGES_PER_SECTION)
online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
- set_zone_contiguous(zone);
+ set_zone_contiguous(zone, new_contiguous_state);
return ret;
}
@@ -1153,6 +1220,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
const int nid = zone_to_nid(zone);
int need_zonelists_rebuild = 0;
unsigned long flags;
+ enum zone_contig_state new_contiguous_state = ZONE_CONTIG_NO;
int ret;
/*
@@ -1166,6 +1234,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
!IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
return -EINVAL;
+ new_contiguous_state = zone_contig_state_after_growing(zone, pfn, nr_pages);
/* associate pfn range with the zone */
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
@@ -1204,7 +1273,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
}
online_pages_range(pfn, nr_pages);
- set_zone_contiguous(zone);
+ set_zone_contiguous(zone, new_contiguous_state);
adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
if (node_arg.nid >= 0)
diff --git a/mm/mm_init.c b/mm/mm_init.c
index fc2a6f1e518f..0c41f1004847 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2263,11 +2263,19 @@ void __init init_cma_pageblock(struct page *page)
}
#endif
-void set_zone_contiguous(struct zone *zone)
+void set_zone_contiguous(struct zone *zone, enum zone_contig_state state)
{
unsigned long block_start_pfn = zone->zone_start_pfn;
unsigned long block_end_pfn;
+ if (state == ZONE_CONTIG_YES) {
+ zone->contiguous = true;
+ return;
+ }
+
+ if (state == ZONE_CONTIG_NO)
+ return;
+
block_end_pfn = pageblock_end_pfn(block_start_pfn);
for (; block_start_pfn < zone_end_pfn(zone);
block_start_pfn = block_end_pfn,
@@ -2348,7 +2356,7 @@ void __init page_alloc_init_late(void)
shuffle_free_memory(NODE_DATA(nid));
for_each_populated_zone(zone)
- set_zone_contiguous(zone);
+ set_zone_contiguous(zone, ZONE_CONTIG_MAYBE);
/* Initialize page ext after all struct pages are initialized. */
if (deferred_struct_pages)
--
2.47.1
One more thing, the fact that the clear_zone_contiguous() happens in
move_pfn_range_to_zone() is a bit suboptimal.

Maybe we need a comment here like

/*
 * Calculate the new zone contig state before move_pfn_range_to_zone()
 * sets the zone temporarily to non-contiguous.
 */

Or something like that.

Alternatively, we have to rework the code a bit that this dependency is
a bit clearer.

> +	new_contiguous_state = zone_contig_state_after_growing(zone, pfn, nr_pages);
>
>  	/* associate pfn range with the zone */
>  	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
> @@ -1204,7 +1273,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
>  	}
>
>  	online_pages_range(pfn, nr_pages);
> -	set_zone_contiguous(zone);
> +	set_zone_contiguous(zone, new_contiguous_state);
>  	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);

--
Cheers

David
On 1/7/2026 4:25 AM, David Hildenbrand (Red Hat) wrote:
>
> One more thing, the fact that the clear_zone_contiguous() happens in
> move_pfn_range_to_zone() is a bit suboptimal.
>
> Maybe we need a comment here like
>
> /*
>  * Calculate the new zone contig state before move_pfn_range_to_zone()
>  * sets the zone temporarily to non-contiguous.
>  */
>
> Or something like that.
>

Will change accordingly. Thanks.

> Alternatively, we have to rework the code a bit that this dependency
> is a bit clearer.

Agreed. Probably we can have this optimization first and then explore a
more explicit approach for the zone contiguous state management.

>
>> +	new_contiguous_state = zone_contig_state_after_growing(zone,
>> pfn, nr_pages);
>>  	/* associate pfn range with the zone */
>>  	move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
>> @@ -1204,7 +1273,7 @@ int online_pages(unsigned long pfn, unsigned
>> long nr_pages,
>>  	}
>>  	online_pages_range(pfn, nr_pages);
>> -	set_zone_contiguous(zone);
>> +	set_zone_contiguous(zone, new_contiguous_state);
>>  	adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
>

Regards,
Tianyou
On 12/22/25 15:58, Tianyou Li wrote:
> When invoke move_pfn_range_to_zone or remove_pfn_range_from_zone, it will
> update the zone->contiguous by checking the new zone's pfn range from the
> beginning to the end, regardless the previous state of the old zone. When
> the zone's pfn range is large, the cost of traversing the pfn range to
> update the zone->contiguous could be significant.
>
> Add fast paths to quickly detect cases where zone is definitely not
> contiguous without scanning the new zone. The cases are: when the new range
> did not overlap with previous range, the contiguous should be false; if the
> new range adjacent with the previous range, just need to check the new
> range; if the new added pages could not fill the hole of previous zone, the
> contiguous should be false.
>
> The following test cases of memory hotplug for a VM [1], tested in the
> environment [2], show that this optimization can significantly reduce the
> memory hotplug time [3].
>
> +----------------+------+---------------+--------------+----------------+
> | | Size | Time (before) | Time (after) | Time Reduction |
> | +------+---------------+--------------+----------------+
> | Plug Memory | 256G | 10s | 2s | 80% |
> | +------+---------------+--------------+----------------+
> | | 512G | 33s | 6s | 81% |
> +----------------+------+---------------+--------------+----------------+
>
> +----------------+------+---------------+--------------+----------------+
> | | Size | Time (before) | Time (after) | Time Reduction |
> | +------+---------------+--------------+----------------+
> | Unplug Memory | 256G | 10s | 2s | 80% |
> | +------+---------------+--------------+----------------+
> | | 512G | 34s | 6s | 82% |
> +----------------+------+---------------+--------------+----------------+
>
Again, very nice results.
[...]
>
> +static enum zone_contig_state zone_contig_state_after_shrinking(struct zone *zone,
> + unsigned long start_pfn, unsigned long nr_pages)
> +{
> + const unsigned long end_pfn = start_pfn + nr_pages;
> +
> + /*
> + * If the removed pfn range inside the original zone span, the contiguous
> + * property is surely false.
/*
* If we cut a hole into the zone span, then the zone is
* certainly not contiguous.
*/
> + */
> + if (start_pfn > zone->zone_start_pfn && end_pfn < zone_end_pfn(zone))
> + return ZONE_CONTIG_NO;
> +
> + /* If the removed pfn range is at the beginning or end of the
> + * original zone span, the contiguous property is preserved when
> + * the original zone is contiguous.
/* Removing from the start/end of the zone will not change anything. */
> + */
> + if (start_pfn == zone->zone_start_pfn || end_pfn == zone_end_pfn(zone))
> + return zone->contiguous ? ZONE_CONTIG_YES : ZONE_CONTIG_MAYBE;
> +
> + return ZONE_CONTIG_MAYBE;
> +}
> +
> void remove_pfn_range_from_zone(struct zone *zone,
> unsigned long start_pfn,
> unsigned long nr_pages)
> @@ -551,6 +573,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
> const unsigned long end_pfn = start_pfn + nr_pages;
> struct pglist_data *pgdat = zone->zone_pgdat;
> unsigned long pfn, cur_nr_pages;
> + enum zone_contig_state new_contiguous_state = ZONE_CONTIG_MAYBE;
No need to initialize, given that you overwrite the value below.
>
> /* Poison struct pages because they are now uninitialized again. */
> for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
> @@ -571,12 +594,14 @@ void remove_pfn_range_from_zone(struct zone *zone,
> if (zone_is_zone_device(zone))
> return;
>
> + new_contiguous_state = zone_contig_state_after_shrinking(zone, start_pfn,
> + nr_pages);
> clear_zone_contiguous(zone);
>
> shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
> update_pgdat_span(pgdat);
>
> - set_zone_contiguous(zone);
> + set_zone_contiguous(zone, new_contiguous_state);
> }
>
> /**
> @@ -736,6 +761,39 @@ static inline void section_taint_zone_device(unsigned long pfn)
> }
> #endif
>
> +static enum zone_contig_state zone_contig_state_after_growing(struct zone *zone,
> + unsigned long start_pfn, unsigned long nr_pages)
> +{
> + const unsigned long end_pfn = start_pfn + nr_pages;
> +
> + if (zone_is_empty(zone))
> + return ZONE_CONTIG_YES;
> +
> + /*
> + * If the moved pfn range does not intersect with the original zone spa
s/spa/span/
> + * the contiguous property is surely false.
"the zone is surely not contiguous."
> + */
> + if (end_pfn < zone->zone_start_pfn || start_pfn > zone_end_pfn(zone))
> + return ZONE_CONTIG_NO;
> +
> + /*
> + * If the moved pfn range is adjacent to the original zone span, given
> + * the moved pfn range's contiguous property is always true, the zone's
> + * contiguous property inherited from the original value.
> + */
/* Adding to the start/end of the zone will not change anything. */
> + if (end_pfn == zone->zone_start_pfn || start_pfn == zone_end_pfn(zone))
> + return zone->contiguous ? ZONE_CONTIG_YES : ZONE_CONTIG_NO;
> +
> + /*
> + * If the original zone's hole larger than the moved pages in the range
> + * the contiguous property is surely false.
> + */
/* If we cannot fill the hole, the zone stays not contiguous. */
> + if (nr_pages < (zone->spanned_pages - zone->present_pages))
> + return ZONE_CONTIG_NO;
> +
> + return ZONE_CONTIG_MAYBE;
> +}
> +
> /*
> * Associate the pfn range with the given zone, initializing the memmaps
> * and resizing the pgdat/zone data to span the added pages. After this
> @@ -1090,11 +1148,20 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
> {
> unsigned long end_pfn = pfn + nr_pages;
> int ret, i;
> + enum zone_contig_state new_contiguous_state = ZONE_CONTIG_NO;
>
> ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)), PFN_PHYS(nr_pages));
> if (ret)
> return ret;
>
> + /*
> + * If the allocated memmap pages are not in a full section, keep the
> + * contiguous state as ZONE_CONTIG_NO.
> + */
> + if (IS_ALIGNED(end_pfn, PAGES_PER_SECTION))
> + new_contiguous_state = zone_contig_state_after_growing(zone,
> + pfn, nr_pages);
> +
This is nasty. I would wish we could just leave that code path alone.
In particular: I am 99% sure that we never ever run into this case in
practice.
E.g., on x86, we can have up to 2 GiB memory blocks. But the memmap of
that is 64/4096*2GiB == 32 MB ... and a memory section is 128 MiB.
As commented on patch #1, we should drop the set_zone_contiguous() in
this function either way and let online_pages() deal with it.
We just have to make sure that we don't create some inconsistencies by
doing that.
Can you double-check?
> move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
> false);
>
> @@ -1113,7 +1180,7 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
> if (nr_pages >= PAGES_PER_SECTION)
> online_mem_sections(pfn, ALIGN_DOWN(end_pfn, PAGES_PER_SECTION));
>
> - set_zone_contiguous(zone);
> + set_zone_contiguous(zone, new_contiguous_state);
> return ret;
> }
>
> @@ -1153,6 +1220,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
> const int nid = zone_to_nid(zone);
> int need_zonelists_rebuild = 0;
> unsigned long flags;
> + enum zone_contig_state new_contiguous_state = ZONE_CONTIG_NO;
> int ret;
>
> /*
> @@ -1166,6 +1234,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
> !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
> return -EINVAL;
>
> + new_contiguous_state = zone_contig_state_after_growing(zone, pfn, nr_pages);
>
> /* associate pfn range with the zone */
> move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
> @@ -1204,7 +1273,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages,
> }
>
> online_pages_range(pfn, nr_pages);
> - set_zone_contiguous(zone);
> + set_zone_contiguous(zone, new_contiguous_state);
> adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
>
> if (node_arg.nid >= 0)
> diff --git a/mm/mm_init.c b/mm/mm_init.c
> index fc2a6f1e518f..0c41f1004847 100644
> --- a/mm/mm_init.c
> +++ b/mm/mm_init.c
> @@ -2263,11 +2263,19 @@ void __init init_cma_pageblock(struct page *page)
> }
> #endif
>
> -void set_zone_contiguous(struct zone *zone)
> +void set_zone_contiguous(struct zone *zone, enum zone_contig_state state)
> {
> unsigned long block_start_pfn = zone->zone_start_pfn;
> unsigned long block_end_pfn;
>
> + if (state == ZONE_CONTIG_YES) {
> + zone->contiguous = true;
> + return;
> + }
> +
Maybe add a comment like
/* We expect an earlier call to clear_zone_contig(). */
And maybe move that comment all the way up in the function and add
VM_WARN_ON_ONCE(zone->contiguous);
> + if (state == ZONE_CONTIG_NO)
> + return;
> +
> block_end_pfn = pageblock_end_pfn(block_start_pfn);
> for (; block_start_pfn < zone_end_pfn(zone);
> block_start_pfn = block_end_pfn,
> @@ -2348,7 +2356,7 @@ void __init page_alloc_init_late(void)
> shuffle_free_memory(NODE_DATA(nid));
>
> for_each_populated_zone(zone)
> - set_zone_contiguous(zone);
> + set_zone_contiguous(zone, ZONE_CONTIG_MAYBE);
>
> /* Initialize page ext after all struct pages are initialized. */
> if (deferred_struct_pages)
--
Cheers
David
Resent to fix the format.
On 1/7/2026 4:18 AM, David Hildenbrand (Red Hat) wrote:
> On 12/22/25 15:58, Tianyou Li wrote:
>> When invoke move_pfn_range_to_zone or remove_pfn_range_from_zone, it
>> will
>> update the zone->contiguous by checking the new zone's pfn range from
>> the
>> beginning to the end, regardless the previous state of the old zone.
>> When
>> the zone's pfn range is large, the cost of traversing the pfn range to
>> update the zone->contiguous could be significant.
>>
>> Add fast paths to quickly detect cases where zone is definitely not
>> contiguous without scanning the new zone. The cases are: when the new
>> range
>> did not overlap with previous range, the contiguous should be false;
>> if the
>> new range adjacent with the previous range, just need to check the new
>> range; if the new added pages could not fill the hole of previous
>> zone, the
>> contiguous should be false.
>>
>> The following test cases of memory hotplug for a VM [1], tested in the
>> environment [2], show that this optimization can significantly reduce
>> the
>> memory hotplug time [3].
>>
>> +----------------+------+---------------+--------------+----------------+
>>
>> | | Size | Time (before) | Time (after) | Time
>> Reduction |
>> | +------+---------------+--------------+----------------+
>> | Plug Memory | 256G | 10s | 2s | 80% |
>> | +------+---------------+--------------+----------------+
>> | | 512G | 33s | 6s | 81% |
>> +----------------+------+---------------+--------------+----------------+
>>
>>
>> +----------------+------+---------------+--------------+----------------+
>>
>> | | Size | Time (before) | Time (after) | Time
>> Reduction |
>> | +------+---------------+--------------+----------------+
>> | Unplug Memory | 256G | 10s | 2s | 80% |
>> | +------+---------------+--------------+----------------+
>> | | 512G | 34s | 6s | 82% |
>> +----------------+------+---------------+--------------+----------------+
>>
>>
>
> Again, very nice results.
>
> [...]
>
Thanks David.
>> +static enum zone_contig_state
>> zone_contig_state_after_shrinking(struct zone *zone,
>> + unsigned long start_pfn, unsigned long nr_pages)
>> +{
>> + const unsigned long end_pfn = start_pfn + nr_pages;
>> +
>> + /*
>> + * If the removed pfn range inside the original zone span, the
>> contiguous
>> + * property is surely false.
>
> /*
> * If we cut a hole into the zone span, then the zone is
> * certainly not contiguous.
> */
Will change accordingly. Thanks.
>
>> + */
>> + if (start_pfn > zone->zone_start_pfn && end_pfn <
>> zone_end_pfn(zone))
>> + return ZONE_CONTIG_NO;
>> +
>> + /* If the removed pfn range is at the beginning or end of the
>> + * original zone span, the contiguous property is preserved when
>> + * the original zone is contiguous.
>
> /* Removing from the start/end of the zone will not change anything. */
>
Will change accordingly. Thanks.
>> + */
>> + if (start_pfn == zone->zone_start_pfn || end_pfn ==
>> zone_end_pfn(zone))
>> + return zone->contiguous ? ZONE_CONTIG_YES : ZONE_CONTIG_MAYBE;
>> +
>> + return ZONE_CONTIG_MAYBE;
>> +}
>> +
>> void remove_pfn_range_from_zone(struct zone *zone,
>> unsigned long start_pfn,
>> unsigned long nr_pages)
>> @@ -551,6 +573,7 @@ void remove_pfn_range_from_zone(struct zone *zone,
>> const unsigned long end_pfn = start_pfn + nr_pages;
>> struct pglist_data *pgdat = zone->zone_pgdat;
>> unsigned long pfn, cur_nr_pages;
>> + enum zone_contig_state new_contiguous_state = ZONE_CONTIG_MAYBE;
>
> No need to initialize, given that you overwrite the value below.
>
Will change accordingly. Thanks.
>> /* Poison struct pages because they are now uninitialized
>> again. */
>> for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
>> @@ -571,12 +594,14 @@ void remove_pfn_range_from_zone(struct zone *zone,
>> if (zone_is_zone_device(zone))
>> return;
>> + new_contiguous_state = zone_contig_state_after_shrinking(zone,
>> start_pfn,
>> + nr_pages);
>> clear_zone_contiguous(zone);
>> shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
>> update_pgdat_span(pgdat);
>> - set_zone_contiguous(zone);
>> + set_zone_contiguous(zone, new_contiguous_state);
>> }
>> /**
>> @@ -736,6 +761,39 @@ static inline void
>> section_taint_zone_device(unsigned long pfn)
>> }
>> #endif
>> +static enum zone_contig_state
>> zone_contig_state_after_growing(struct zone *zone,
>> + unsigned long start_pfn, unsigned long nr_pages)
>> +{
>> + const unsigned long end_pfn = start_pfn + nr_pages;
>> +
>> + if (zone_is_empty(zone))
>> + return ZONE_CONTIG_YES;
>> +
>> + /*
>> + * If the moved pfn range does not intersect with the original
>> zone spa
>
> s/spa/span/
>
My mistake:(
Will change accordingly. Thanks.
>> + * the contiguous property is surely false.
>
> "the zone is surely not contiguous."
>
Will change accordingly. Thanks.
>> + */
>> + if (end_pfn < zone->zone_start_pfn || start_pfn >
>> zone_end_pfn(zone))
>> + return ZONE_CONTIG_NO;
>> +
>> + /*
>> + * If the moved pfn range is adjacent to the original zone span,
>> given
>> + * the moved pfn range's contiguous property is always true, the
>> zone's
>> + * contiguous property inherited from the original value.
>> + */
>
> /* Adding to the start/end of the zone will not change anything. */
>
Will change accordingly. Thanks.
>> + if (end_pfn == zone->zone_start_pfn || start_pfn ==
>> zone_end_pfn(zone))
>> + return zone->contiguous ? ZONE_CONTIG_YES : ZONE_CONTIG_NO;
>> +
>> + /*
>> + * If the original zone's hole larger than the moved pages in
>> the range
>> + * the contiguous property is surely false.
>> + */
>
> /* If we cannot fill the hole, the zone stays not contiguous. */
>
Will change accordingly. Thanks.
>> + if (nr_pages < (zone->spanned_pages - zone->present_pages))
>> + return ZONE_CONTIG_NO;
>> +
>> + return ZONE_CONTIG_MAYBE;
>> +}
>> +
>> /*
>> * Associate the pfn range with the given zone, initializing the
>> memmaps
>> * and resizing the pgdat/zone data to span the added pages. After
>> this
>> @@ -1090,11 +1148,20 @@ int mhp_init_memmap_on_memory(unsigned long
>> pfn, unsigned long nr_pages,
>> {
>> unsigned long end_pfn = pfn + nr_pages;
>> int ret, i;
>> + enum zone_contig_state new_contiguous_state = ZONE_CONTIG_NO;
>> ret = kasan_add_zero_shadow(__va(PFN_PHYS(pfn)),
>> PFN_PHYS(nr_pages));
>> if (ret)
>> return ret;
>> + /*
>> + * If the allocated memmap pages are not in a full section, keep
>> the
>> + * contiguous state as ZONE_CONTIG_NO.
>> + */
>> + if (IS_ALIGNED(end_pfn, PAGES_PER_SECTION))
>> + new_contiguous_state = zone_contig_state_after_growing(zone,
>> + pfn, nr_pages);
>> +
>
> This is nasty. I would wish we could just leave that code path alone.
>
> In particular: I am 99% sure that we never ever run into this case in
> practice.
>
> E.g., on x86, we can have up to 2 GiB memory blocks. But the memmap of
> that is 64/4096*2GiB == 32 MB ... and a memory section is 128 MiB.
>
>
> As commented on patch #1, we should drop the set_zone_contiguous() in
> this function either way and let online_pages() deal with it.
>
> We just have to make sure that we don't create some inconsistencies by
> doing that.
>
> Can you double-check?
>
Agreed, it is a very narrow corner case: only when end_pfn is aligned with
PAGES_PER_SECTION will all the memory sections be onlined, so that
pfn_to_online_page gets a non-NULL value and the zone contiguous state has
a chance to be true; otherwise the zone contiguous state will always be
false. In practice this path will rarely get touched.

However, this optimization relies on the previous contiguous state of the
zone. In the corner case the zone could actually be contiguous, but without
set_zone_contiguous() the zone's contiguous state would remain false, and
the fast path in online_pages() could then be incorrect.

I agree that in patch #1 we can remove the set_zone_contiguous() call,
because at that point online_pages() did not depend on the previous
contiguous state. Now, after this optimization, this part of the code seems
necessary to keep here.

One solution is to simplify it by calling
set_zone_contiguous(zone, ZONE_CONTIG_MAYBE) once IS_ALIGNED(end_pfn,
PAGES_PER_SECTION) holds, but this part of the code would still be in
mhp_init_memmap_on_memory(). If we want online_pages() to deal with this
corner case instead, we would need to either pass the information to
online_pages() through an additional parameter, or add some code in
online_pages() to get the memory block information from the pfn and then
check the altmap, which does not seem like a good approach: the common code
path would have to handle a very rare corner case in every call of
online_pages().
Solution1:

mhp_init_memmap_on_memory(...)
{
	…
	/*
	 * It's a corner case where all the memory sections will be online,
	 * thus the zone contiguous state needs to be populated.
	 */
	if (IS_ALIGNED(end_pfn, PAGES_PER_SECTION))
		set_zone_contiguous(zone, ZONE_CONTIG_MAYBE);
}

Solution2.1:

online_pages(..., nr_vmemmap_pages)
{
	...
	if (nr_vmemmap_pages > 0)
		set_zone_contiguous(zone, ZONE_CONTIG_MAYBE);
	...
}

Solution2.2:

online_pages(...)
{
	...
	struct memory_block *mem = pfn_to_memory_block(start_pfn);

	if (mem->altmap && mem->altmap->free)
		set_zone_contiguous(zone, ZONE_CONTIG_MAYBE);
	...
}
Looking forward to hearing your advice. Thanks.
>> move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
>> false);
>> @@ -1113,7 +1180,7 @@ int mhp_init_memmap_on_memory(unsigned long
>> pfn, unsigned long nr_pages,
>> if (nr_pages >= PAGES_PER_SECTION)
>> online_mem_sections(pfn, ALIGN_DOWN(end_pfn,
>> PAGES_PER_SECTION));
>> - set_zone_contiguous(zone);
>> + set_zone_contiguous(zone, new_contiguous_state);
>> return ret;
>> }
>> @@ -1153,6 +1220,7 @@ int online_pages(unsigned long pfn, unsigned
>> long nr_pages,
>> const int nid = zone_to_nid(zone);
>> int need_zonelists_rebuild = 0;
>> unsigned long flags;
>> + enum zone_contig_state new_contiguous_state = ZONE_CONTIG_NO;
>> int ret;
>> /*
>> @@ -1166,6 +1234,7 @@ int online_pages(unsigned long pfn, unsigned
>> long nr_pages,
>> !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION)))
>> return -EINVAL;
>> + new_contiguous_state = zone_contig_state_after_growing(zone,
>> pfn, nr_pages);
>> /* associate pfn range with the zone */
>> move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_MOVABLE,
>> @@ -1204,7 +1273,7 @@ int online_pages(unsigned long pfn, unsigned
>> long nr_pages,
>> }
>> online_pages_range(pfn, nr_pages);
>> - set_zone_contiguous(zone);
>> + set_zone_contiguous(zone, new_contiguous_state);
>> adjust_present_page_count(pfn_to_page(pfn), group, nr_pages);
>> if (node_arg.nid >= 0)
>> diff --git a/mm/mm_init.c b/mm/mm_init.c
>> index fc2a6f1e518f..0c41f1004847 100644
>> --- a/mm/mm_init.c
>> +++ b/mm/mm_init.c
>> @@ -2263,11 +2263,19 @@ void __init init_cma_pageblock(struct page
>> *page)
>> }
>> #endif
>> -void set_zone_contiguous(struct zone *zone)
>> +void set_zone_contiguous(struct zone *zone, enum zone_contig_state
>> state)
>> {
>> unsigned long block_start_pfn = zone->zone_start_pfn;
>> unsigned long block_end_pfn;
>> + if (state == ZONE_CONTIG_YES) {
>> + zone->contiguous = true;
>> + return;
>> + }
>> +
>
> Maybe add a comment like
>
> /* We expect an earlier call to clear_zone_contig(). */
>
>
> And maybe move that comment all the way up in the function and add
>
> VM_WARN_ON_ONCE(zone->contiguous);
Will change accordingly. Thanks.
>
>> + if (state == ZONE_CONTIG_NO)
>> + return;
>> +
>> block_end_pfn = pageblock_end_pfn(block_start_pfn);
>> for (; block_start_pfn < zone_end_pfn(zone);
>> block_start_pfn = block_end_pfn,
>> @@ -2348,7 +2356,7 @@ void __init page_alloc_init_late(void)
>> shuffle_free_memory(NODE_DATA(nid));
>> for_each_populated_zone(zone)
>> - set_zone_contiguous(zone);
>> + set_zone_contiguous(zone, ZONE_CONTIG_MAYBE);
>> /* Initialize page ext after all struct pages are
>> initialized. */
>> if (deferred_struct_pages)
>
>
>>
>> This is nasty. I would wish we could just leave that code path alone.
>>
>> In particular: I am 99% sure that we never ever run into this case in
>> practice.
>>
>> E.g., on x86, we can have up to 2 GiB memory blocks. But the memmap of
>> that is 64/4096*2GiB == 32 MB ... and a memory section is 128 MiB.
>>
>>
>> As commented on patch #1, we should drop the set_zone_contiguous() in
>> this function either way and let online_pages() deal with it.
>>
>> We just have to make sure that we don't create some inconsistencies by
>> doing that.
>>
>> Can you double-check?

I thought about this some more, and it's all a bit nasty. We have to get
this right.

Losing the optimization for memmap_on_memory users indicates that we are
doing the wrong thing.

You could introduce the set_zone_contiguous() in this patch here. But then,
I think instead of

+	/*
+	 * If the allocated memmap pages are not in a full section, keep the
+	 * contiguous state as ZONE_CONTIG_NO.
+	 */
+	if (IS_ALIGNED(end_pfn, PAGES_PER_SECTION))
+		new_contiguous_state = zone_contig_state_after_growing(zone,
+					pfn, nr_pages);
+

We'd actually unconditionally have to do that, no?

--
Cheers

David
Sorry for my delayed response. I appreciate all your suggestions, David.
My thoughts are inlined below for your kind consideration.
On 1/14/2026 7:13 PM, David Hildenbrand (Red Hat) wrote:
>>>
>>> This is nasty. I would wish we could just leave that code path alone.
>>>
>>> In particular: I am 99% sure that we never ever run into this case in
>>> practice.
>>>
>>> E.g., on x86, we can have up to 2 GiB memory blocks. But the memmap of
>>> that is 64/4096*2GiB == 32 MB ... and a memory section is 128 MiB.
>>>
>>>
>>> As commented on patch #1, we should drop the set_zone_contiguous() in
>>> this function either way and let online_pages() deal with it.
>>>
>>> We just have to make sure that we don't create some inconsistencies by
>>> doing that.
>>>
>>> Can you double-check?
>
> I thought about this some more, and it's all a bit nasty. We have to
> get this right.
>
> Losing the optimization for memmap_on_memory users indicates that we
> are doing the wrong thing.
>
> You could introduce the set_zone_contiguous() in this patch here. But
> then, I think instead of
>
> + /*
> + * If the allocated memmap pages are not in a full section, keep the
> + * contiguous state as ZONE_CONTIG_NO.
> + */
> + if (IS_ALIGNED(end_pfn, PAGES_PER_SECTION))
> + new_contiguous_state = zone_contig_state_after_growing(zone,
> + pfn, nr_pages);
> +
>
> We'd actually unconditionally have to do that, no?
>
That's a great idea! Probably we need to move
the adjust_present_page_count right after mhp_init_memmap_on_memory to
keep the present_pages in sync.
When the zone was previously contiguous, there are probably 2 situations:

1. If the newly added range is at the start of the zone, then after
mhp_init_memmap_on_memory the zone contiguous state will be false via the
fast path. That is expected as long as adjust_present_page_count is called
before online_pages, so zone_contig_state_after_growing in online_pages
will return ZONE_CONTIG_MAYBE, which is expected. Then set_zone_contiguous
in online_pages will get the correct result.

2. If the newly added range is at the end of the zone, then
zone_contig_state_after_growing will return ZONE_CONTIG_YES or
ZONE_CONTIG_NO, regardless of whether the memory sections are online or
not. When the contiguous check reaches online_pages, it will follow the
fast path and get the correct contiguous state.

When the zone was not contiguous previously, the new zone contiguous state
will in any case be false in mhp_init_memmap_on_memory, because either
nr_vmemmap_pages cannot fill the hole or the memory section is not online.
If we update present_pages correctly, then in online_pages
zone_contig_state_after_growing has the chance to return ZONE_CONTIG_MAYBE,
and since all memory sections are onlined, set_zone_contiguous will get the
correct result.

I am not sure if you'd like to consider another option: could we
encapsulate mhp_init_memmap_on_memory and online_pages into one function,
e.g. online_memory_block_pages (and offline_memory_block_pages
correspondingly), in memory_hotplug.c? Then we can check the zone
contiguous state as a whole for the newly added range.
int online_memory_block_pages(unsigned long start_pfn, unsigned long nr_pages,
		unsigned long nr_vmemmap_pages, struct zone *zone,
		struct memory_group *group)
{
	bool contiguous = zone->contiguous;
	enum zone_contig_state new_contiguous_state;
	int ret;

	/*
	 * Calculate the new zone contig state before move_pfn_range_to_zone()
	 * sets the zone temporarily to non-contiguous.
	 */
	new_contiguous_state = zone_contig_state_after_growing(zone, start_pfn,
							       nr_pages);

	if (nr_vmemmap_pages) {
		ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
						zone);
		if (ret)
			goto restore_zone_contig;
	}

	ret = online_pages(start_pfn + nr_vmemmap_pages,
			   nr_pages - nr_vmemmap_pages, zone, group);
	if (ret) {
		if (nr_vmemmap_pages)
			mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
		goto restore_zone_contig;
	}

	/*
	 * Account once onlining succeeded. If the zone was unpopulated, it is
	 * now already properly populated.
	 */
	if (nr_vmemmap_pages)
		adjust_present_page_count(pfn_to_page(start_pfn), group,
					  nr_vmemmap_pages);

	/*
	 * Now that the ranges are indicated as online, check whether the whole
	 * zone is contiguous.
	 */
	set_zone_contiguous(zone, new_contiguous_state);
	return 0;

restore_zone_contig:
	zone->contiguous = contiguous;
	return ret;
}
I am OK with either way, please let me know your preference. Thanks.
On 1/15/26 16:09, Li, Tianyou wrote:
> Sorry for my delayed response. Appreciated for all your suggestions
> David. My thoughts inlined for your kind consideration.
>
> On 1/14/2026 7:13 PM, David Hildenbrand (Red Hat) wrote:
>>>>
>>>> This is nasty. I would wish we could just leave that code path alone.
>>>>
>>>> In particular: I am 99% sure that we never ever run into this case in
>>>> practice.
>>>>
>>>> E.g., on x86, we can have up to 2 GiB memory blocks. But the memmap of
>>>> that is 64/4096*2GiB == 32 MB ... and a memory section is 128 MiB.
>>>>
>>>>
>>>> As commented on patch #1, we should drop the set_zone_contiguous() in
>>>> this function either way and let online_pages() deal with it.
>>>>
>>>> We just have to make sure that we don't create some inconsistencies by
>>>> doing that.
>>>>
>>>> Can you double-check?
>>
>> I thought about this some more, and it's all a bit nasty. We have to
>> get this right.
>>
>> Losing the optimization for memmap_on_memory users indicates that we
>> are doing the wrong thing.
>>
>> You could introduce the set_zone_contiguous() in this patch here. But
>> then, I think instead of
>>
>> + /*
>> + * If the allocated memmap pages are not in a full section, keep the
>> + * contiguous state as ZONE_CONTIG_NO.
>> + */
>> + if (IS_ALIGNED(end_pfn, PAGES_PER_SECTION))
>> + new_contiguous_state = zone_contig_state_after_growing(zone,
>> + pfn, nr_pages);
>> +
>>
>> We'd actually unconditionally have to do that, no?
>>
> That's a great idea! Probably we need to move
> the adjust_present_page_count right after mhp_init_memmap_on_memory to
> keep the present_pages in sync.
>
> When the zone is contiguous previously, there probably 2 situations:
>
> 1. If new added range is at the start of the zone, then
> after mhp_init_memmap_on_memory, the zone contiguous will be false at
> fast path. It should be expected as long as the
> adjust_present_page_count was called before the online_pages,
> so zone_contig_state_after_growing in online_pages will return
> ZONE_CONTIG_MAYBE, which is expected. Then the set_zone_contiguous in
> online_pages will get correct result.
>
> 2. If new added range is at the end of the zone, then
> the zone_contig_state_after_growing will return ZONE_CONTIG_YES or
> ZONE_CONTIG_NO, regardless of the memory section online or not. When the
> contiguous check comes into the online_pages, it will follow the fast
> path and get the correct contiguous state.
>
> When the zone is not contiguous previously, in any case the new zone
> contiguous state will be false in mhp_init_memmap_on_memory, because
> either the nr_vmemmap_pages can not fill the hole or the memory section
> is not online. If we update the present_pages correctly, then in
> online_pages, the zone_contig_state_after_growing could have the chance
> to return ZONE_CONTIG_MAYBE, and since all memory sections are onlined,
> the set_zone_contiguous will get the correct result.
>
>
> I am not sure if you'd like to consider another option: could we
> encapsulate the mhp_init_memmap_on_memory and online_pages into one
> function eg. online_memory_block_pages, and offline_memory_block_pages
> correspondingly as well, in the memory_hotplug.c. So we can check the
> zone contiguous state as the whole for the new added range.
>
> int online_memory_block_pages(unsigned long start_pfn, unsigned long
> nr_pages,
> unsigned long nr_vmemmap_pages, struct zone *zone,
> struct memory_group *group)
> {
> bool contiguous = zone->contiguous;
> enum zone_contig_state new_contiguous_state;
> int ret;
>
> /*
> * Calculate the new zone contig state before move_pfn_range_to_zone()
> * sets the zone temporarily to non-contiguous.
> */
> new_contiguous_state = zone_contig_state_after_growing(zone, start_pfn,
> nr_pages);
Should we clear the flag here?
>
> if (nr_vmemmap_pages) {
> ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
> zone);
> if (ret)
> goto restore_zone_contig;
> }
>
> ret = online_pages(start_pfn + nr_vmemmap_pages,
> nr_pages - nr_vmemmap_pages, zone, group);
> if (ret) {
> if (nr_vmemmap_pages)
> mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
> goto restore_zone_contig;
> }
>
> /*
> * Account once onlining succeeded. If the zone was unpopulated, it is
> * now already properly populated.
> */
> if (nr_vmemmap_pages)
> adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
> nr_vmemmap_pages);
>
> /*
> * Now that the ranges are indicated as online, check whether the whole
> * zone is contiguous.
> */
> set_zone_contiguous(zone, new_contiguous_state);
> return 0;
>
> restore_zone_contig:
> zone->contiguous = contiguous;
> return ret;
> }
That is even better, although it sucks to have to handle it on that
level, and that it's so far away from actual zone resizing code.
Should we do the same on the offlining path?
--
Cheers
David
On 1/16/2026 1:00 AM, David Hildenbrand (Red Hat) wrote:
> On 1/15/26 16:09, Li, Tianyou wrote:
>> Sorry for my delayed response. Appreciated for all your suggestions
>> David. My thoughts inlined for your kind consideration.
>>
>> On 1/14/2026 7:13 PM, David Hildenbrand (Red Hat) wrote:
>>>>>
>>>>> This is nasty. I would wish we could just leave that code path alone.
>>>>>
>>>>> In particular: I am 99% sure that we never ever run into this case in
>>>>> practice.
>>>>>
>>>>> E.g., on x86, we can have up to 2 GiB memory blocks. But the
>>>>> memmap of
>>>>> that is 64/4096*2GiB == 32 MB ... and a memory section is 128 MiB.
>>>>>
>>>>>
>>>>> As commented on patch #1, we should drop the set_zone_contiguous() in
>>>>> this function either way and let online_pages() deal with it.
>>>>>
>>>>> We just have to make sure that we don't create some
>>>>> inconsistencies by
>>>>> doing that.
>>>>>
>>>>> Can you double-check?
>>>
>>> I thought about this some more, and it's all a bit nasty. We have to
>>> get this right.
>>>
>>> Losing the optimization for memmap_on_memory users indicates that we
>>> are doing the wrong thing.
>>>
>>> You could introduce the set_zone_contiguous() in this patch here. But
>>> then, I think instead of
>>>
>>> + /*
>>> + * If the allocated memmap pages are not in a full section,
>>> keep the
>>> + * contiguous state as ZONE_CONTIG_NO.
>>> + */
>>> + if (IS_ALIGNED(end_pfn, PAGES_PER_SECTION))
>>> + new_contiguous_state = zone_contig_state_after_growing(zone,
>>> + pfn, nr_pages);
>>> +
>>>
>>> We'd actually unconditionally have to do that, no?
>>>
>> That's a great idea! Probably we need to move
>> the adjust_present_page_count right after mhp_init_memmap_on_memory to
>> keep the present_pages in sync.
>>
>> When the zone is contiguous previously, there probably 2 situations:
>>
>> 1. If new added range is at the start of the zone, then
>> after mhp_init_memmap_on_memory, the zone contiguous will be false at
>> fast path. It should be expected as long as the
>> adjust_present_page_count was called before the online_pages,
>> so zone_contig_state_after_growing in online_pages will return
>> ZONE_CONTIG_MAYBE, which is expected. Then the set_zone_contiguous in
>> online_pages will get correct result.
>>
>> 2. If new added range is at the end of the zone, then
>> the zone_contig_state_after_growing will return ZONE_CONTIG_YES or
>> ZONE_CONTIG_NO, regardless of the memory section online or not. When the
>> contiguous check comes into the online_pages, it will follow the fast
>> path and get the correct contiguous state.
>>
>> When the zone is not contiguous previously, in any case the new zone
>> contiguous state will be false in mhp_init_memmap_on_memory, because
>> either the nr_vmemmap_pages can not fill the hole or the memory section
>> is not online. If we update the present_pages correctly, then in
>> online_pages, the zone_contig_state_after_growing could have the chance
>> to return ZONE_CONTIG_MAYBE, and since all memory sections are onlined,
>> the set_zone_contiguous will get the correct result.
>>
>>
>> I am not sure if you'd like to consider another option: could we
>> encapsulate the mhp_init_memmap_on_memory and online_pages into one
>> function eg. online_memory_block_pages, and offline_memory_block_pages
>> correspondingly as well, in the memory_hotplug.c. So we can check the
>> zone contiguous state as the whole for the new added range.
>>
>> int online_memory_block_pages(unsigned long start_pfn, unsigned long
>> nr_pages,
>> unsigned long nr_vmemmap_pages, struct zone *zone,
>> struct memory_group *group)
>> {
>> bool contiguous = zone->contiguous;
>> enum zone_contig_state new_contiguous_state;
>> int ret;
>>
>> /*
>> * Calculate the new zone contig state before
>> move_pfn_range_to_zone()
>> * sets the zone temporarily to non-contiguous.
>> */
>> new_contiguous_state = zone_contig_state_after_growing(zone,
>> start_pfn,
>> nr_pages);
>
> Should we clear the flag here?
>
Yuan and I had quite a bit of debate here. The contiguous flag should be
cleared once the zone is changed, so it makes sense that
move_pfn_range_to_zone and remove_pfn_range_from_zone are coupled with
clear_zone_contiguous. set_zone_contiguous, on the other hand, depends on
pages being online, so it should be carefully invoked after pages are
onlined or before they are offlined.
>>
>> if (nr_vmemmap_pages) {
>> ret = mhp_init_memmap_on_memory(start_pfn, nr_vmemmap_pages,
>> zone);
>> if (ret)
>> goto restore_zone_contig;
>> }
>>
>> ret = online_pages(start_pfn + nr_vmemmap_pages,
>> nr_pages - nr_vmemmap_pages, zone, group);
>> if (ret) {
>> if (nr_vmemmap_pages)
>> mhp_deinit_memmap_on_memory(start_pfn, nr_vmemmap_pages);
>> goto restore_zone_contig;
>> }
>>
>> /*
>> * Account once onlining succeeded. If the zone was
>> unpopulated, it is
>> * now already properly populated.
>> */
>> if (nr_vmemmap_pages)
>> adjust_present_page_count(pfn_to_page(start_pfn), mem->group,
>> nr_vmemmap_pages);
>>
>> /*
>> * Now that the ranges are indicated as online, check whether
>> the whole
>> * zone is contiguous.
>> */
>> set_zone_contiguous(zone, new_contiguous_state);
>> return 0;
>>
>> restore_zone_contig:
>> zone->contiguous = contiguous;
>> return ret;
>> }
>
> That is even better, although it sucks to have to handle it on that
> level, and that it's so far away from actual zone resizing code.
>
Agreed. Thanks for the confirmation.
> Should we do the same on the offlining path?
>
Yes, definitely. I will work on this change for patch v8. I will post
the patch v8 once tested and confirmed by Yuan.
Regards,
Tianyou