[PATCH 2/2] arm64/mm: Reject memory removal that splits a kernel leaf mapping

Anshuman Khandual posted 2 patches 5 days, 1 hour ago
There is a newer version of this series
[PATCH 2/2] arm64/mm: Reject memory removal that splits a kernel leaf mapping
Posted by Anshuman Khandual 5 days, 1 hour ago
Linear and vmemmap mappings that get torn down during a memory hot remove
operation might contain leaf entries at any page table level. If the
requested memory range's linear or vmemmap mappings fall within such leaf
entries, new mappings need to be created for the remaining memory previously
mapped by the leaf entry, following the standard break-before-make (BBM)
rules.
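
For reference (assuming 4K pages; other page sizes differ), a kernel leaf
mapping here can be any of:

	PUD block		1G
	contiguous PMD run	32M
	PMD block		2M
	contiguous PTE run	64K
	single PTE		4K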

Currently the memory hot remove operation does not perform such restructuring,
and so memory removals that would split a kernel leaf mapping need to be
rejected.
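
For example (hypothetical addresses, 4K pages): if the linear map covers PA
range [0x40000000, 0x80000000) with a single 1GiB PUD block, then hot
removing just [0x60000000, 0x68000000) leaves both edges of the removed
range inside that leaf:

	linear_start = __phys_to_virt(0x60000000);	/* mid PUD block */
	linear_end   = __phys_to_virt(0x68000000);	/* mid PUD block */

and hence such a request gets rejected.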

Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
Closes: https://lore.kernel.org/all/aWZYXhrT6D2M-7-N@willie-the-truck/
Fixes: bbd6ec605c0f ("arm64/mm: Enable memory hot remove")
Cc: stable@vger.kernel.org
Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
---
 arch/arm64/mm/mmu.c | 126 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8ec8a287aaa1..9d59e10fb3de 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -2063,6 +2063,129 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
 	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
 }
 
+
+static bool split_kernel_leaf_boundary(unsigned long addr)
+{
+	pgd_t *pgdp, pgd;
+	p4d_t *p4dp, p4d;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
+
+	/*
+	 * PGD: If addr is PGD aligned then addr already
+	 * describes a leaf boundary.
+	 */
+	if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr)
+		return false;
+
+	pgdp = pgd_offset_k(addr);
+	pgd = pgdp_get(pgdp);
+	if (!pgd_present(pgd))
+		return false;
+
+	/*
+	 * P4D: If addr is P4D aligned then addr already
+	 * describes a leaf boundary.
+	 */
+	if (ALIGN_DOWN(addr, P4D_SIZE) == addr)
+		return false;
+
+	p4dp = p4d_offset(pgdp, addr);
+	p4d = p4dp_get(p4dp);
+	if (!p4d_present(p4d))
+		return false;
+
+	/*
+	 * PUD: If addr is PUD aligned then addr already
+	 * describes a leaf boundary.
+	 */
+	if (ALIGN_DOWN(addr, PUD_SIZE) == addr)
+		return false;
+
+	pudp = pud_offset(p4dp, addr);
+	pud = pudp_get(pudp);
+	if (!pud_present(pud))
+		return false;
+
+	if (pud_leaf(pud))
+		return true;
+
+	/*
+	 * CONT_PMD: If addr is CONT_PMD aligned then
+	 * addr already describes a leaf boundary.
+	 */
+	if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr)
+		return false;
+
+	pmdp = pmd_offset(pudp, addr);
+	pmd = pmdp_get(pmdp);
+	if (!pmd_present(pmd))
+		return false;
+
+	if (pmd_leaf(pmd) && pmd_cont(pmd))
+		return true;
+
+	/*
+	 * PMD: If addr is PMD aligned then addr already
+	 * describes a leaf boundary.
+	 */
+	if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
+		return false;
+
+	if (pmd_leaf(pmd))
+		return true;
+
+	/*
+	 * CONT_PTE: If addr is CONT_PTE aligned then addr
+	 * already describes a leaf boundary.
+	 */
+	if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr)
+		return false;
+
+	ptep = pte_offset_kernel(pmdp, addr);
+	pte = __ptep_get(ptep);
+	if (!pte_present(pte))
+		return false;
+
+	if (pte_valid(pte) && pte_cont(pte))
+		return true;
+
+	if (ALIGN_DOWN(addr, PAGE_SIZE) == addr)
+		return false;
+	return true;
+}
+
+static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages)
+{
+	unsigned long linear_start, linear_end, phys_start, phys_end;
+	unsigned long vmemmap_size, vmemmap_start, vmemmap_end;
+
+	/* Assert linear map edges do not split a leaf entry */
+	phys_start = PFN_PHYS(pfn);
+	phys_end = phys_start + nr_pages * PAGE_SIZE;
+	linear_start = __phys_to_virt(phys_start);
+	linear_end =  __phys_to_virt(phys_end);
+	if (split_kernel_leaf_boundary(linear_start) ||
+	    split_kernel_leaf_boundary(linear_end)) {
+		pr_warn("[%lx %lx] splits a leaf entry in linear map\n",
+			phys_start, phys_end);
+		return false;
+	}
+
+	/* Assert vmemmap edges do not split a leaf entry */
+	vmemmap_size = nr_pages * sizeof(struct page);
+	vmemmap_start = (unsigned long) pfn_to_page(pfn);
+	vmemmap_end = vmemmap_start + vmemmap_size;
+	if (split_kernel_leaf_boundary(vmemmap_start) ||
+	    split_kernel_leaf_boundary(vmemmap_end)) {
+		pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n",
+			phys_start, phys_end);
+		return false;
+	}
+	return true;
+}
+
 /*
  * This memory hotplug notifier helps prevent boot memory from being
  * inadvertently removed as it blocks pfn range offlining process in
@@ -2083,6 +2206,9 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
 	if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
 		return NOTIFY_OK;
 
+	if (!can_unmap_without_split(pfn, arg->nr_pages))
+		return NOTIFY_BAD;
+
 	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
 		unsigned long start = PFN_PHYS(pfn);
 		unsigned long end = start + (1UL << PA_SECTION_SHIFT);
-- 
2.30.2
Re: [PATCH 2/2] arm64/mm: Reject memory removal that splits a kernel leaf mapping
Posted by Ryan Roberts 4 days, 20 hours ago
On 02/02/2026 04:26, Anshuman Khandual wrote:
> Linear and vmemmap mappings that get torn down during a memory hot remove
> operation might contain leaf entries at any page table level. If the
> requested memory range's linear or vmemmap mappings fall within such leaf
> entries, new mappings need to be created for the remaining memory previously
> mapped by the leaf entry, following the standard break-before-make (BBM)
> rules.

I think it would be good to mention that the kernel cannot tolerate BBM so
remapping to fine grained leaves would not be possible on systems without
BBML2_NOABORT.
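
Any future live-split path would presumably have to be gated on something
like the below (a sketch, assuming the existing BBML2 detection helper):

	if (!system_supports_bbml2_noabort())
		return -EINVAL;	/* cannot safely split live kernel mappings */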

> 
> Currently the memory hot remove operation does not perform such restructuring,
> and so memory removals that would split a kernel leaf mapping need to be
> rejected.

Perhaps it is useful to mention that while memory_hotplug.c does appear to
permit hot-unplugging arbitrary ranges of memory, the higher layers that drive
memory_hotplug (e.g. ACPI, virtio, ...) all appear to treat memory as fixed size
devices so it is impossible to hotunplug a different amount than was previously
hotplugged, and so we should never see a rejection in practice, but adding the
check makes us robust against a future change.

> 
> Cc: Catalin Marinas <catalin.marinas@arm.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: linux-arm-kernel@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> Closes: https://lore.kernel.org/all/aWZYXhrT6D2M-7-N@willie-the-truck/
> Fixes: bbd6ec605c0f ("arm64/mm: Enable memory hot remove")
> Cc: stable@vger.kernel.org
> Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
> ---
>  arch/arm64/mm/mmu.c | 126 ++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 126 insertions(+)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 8ec8a287aaa1..9d59e10fb3de 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -2063,6 +2063,129 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
>  	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
>  }
>  
> +
> +static bool split_kernel_leaf_boundary(unsigned long addr)

The name currently makes it sound like we are asking for the mapping to be split
(we have existing functions to do this that are named similarly). Perhaps a
better name would be addr_splits_leaf()?

> +{
> +	pgd_t *pgdp, pgd;
> +	p4d_t *p4dp, p4d;
> +	pud_t *pudp, pud;
> +	pmd_t *pmdp, pmd;
> +	pte_t *ptep, pte;
> +
> +	/*
> +	 * PGD: If addr is PGD aligned then addr already
> +	 * describes a leaf boundary.
> +	 */
> +	if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr)
> +		return false;
> +
> +	pgdp = pgd_offset_k(addr);
> +	pgd = pgdp_get(pgdp);
> +	if (!pgd_present(pgd))
> +		return false;
> +
> +	/*
> +	 * P4D: If addr is P4D aligned then addr already
> +	 * describes a leaf boundary.
> +	 */
> +	if (ALIGN_DOWN(addr, P4D_SIZE) == addr)
> +		return false;
> +
> +	p4dp = p4d_offset(pgdp, addr);
> +	p4d = p4dp_get(p4dp);
> +	if (!p4d_present(p4d))
> +		return false;
> +
> +	/*
> +	 * PUD: If addr is PUD aligned then addr already
> +	 * describes a leaf boundary.
> +	 */
> +	if (ALIGN_DOWN(addr, PUD_SIZE) == addr)
> +		return false;
> +
> +	pudp = pud_offset(p4dp, addr);
> +	pud = pudp_get(pudp);
> +	if (!pud_present(pud))
> +		return false;
> +
> +	if (pud_leaf(pud))
> +		return true;
> +
> +	/*
> +	 * CONT_PMD: If addr is CONT_PMD aligned then
> +	 * addr already describes a leaf boundary.
> +	 */
> +	if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr)
> +		return false;
> +
> +	pmdp = pmd_offset(pudp, addr);
> +	pmd = pmdp_get(pmdp);
> +	if (!pmd_present(pmd))
> +		return false;
> +
> +	if (pmd_leaf(pmd) && pmd_cont(pmd))
> +		return true;
> +
> +	/*
> +	 * PMD: If addr is PMD aligned then addr already
> +	 * describes a leaf boundary.
> +	 */
> +	if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
> +		return false;
> +
> +	if (pmd_leaf(pmd))
> +		return true;
> +
> +	/*
> +	 * CONT_PTE: If addr is CONT_PTE aligned then addr
> +	 * already describes a leaf boundary.
> +	 */
> +	if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr)
> +		return false;
> +
> +	ptep = pte_offset_kernel(pmdp, addr);
> +	pte = __ptep_get(ptep);
> +	if (!pte_present(pte))
> +		return false;
> +
> +	if (pte_valid(pte) && pte_cont(pte))

Why do you need pte_valid() here? You have already checked !pte_present(). Are
you expecting a case of present but not valid (PTE_PRESENT_INVALID)? If so, do
you need to consider that for the other levels too? (pmd_leaf() only checks
pmd_present()).

Personally I think you can just drop the pte_valid() check here.
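
i.e. just:

	if (pte_cont(pte))
		return true;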

> +		return true;
> +
> +	if (ALIGN_DOWN(addr, PAGE_SIZE) == addr)
> +		return false;
> +	return true;
> +}
> +
> +static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages)
> +{
> +	unsigned long linear_start, linear_end, phys_start, phys_end;
> +	unsigned long vmemmap_size, vmemmap_start, vmemmap_end;

nit: do we need all these variables. Perhaps just:

unsigned long sz, start, end, phys_start, phys_end;

are sufficient?
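
e.g. (untested sketch, reusing start/end for both maps):

	static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages)
	{
		unsigned long sz, start, end, phys_start, phys_end;

		/* linear map edges */
		phys_start = PFN_PHYS(pfn);
		phys_end = phys_start + nr_pages * PAGE_SIZE;
		start = __phys_to_virt(phys_start);
		end = __phys_to_virt(phys_end);
		if (split_kernel_leaf_boundary(start) ||
		    split_kernel_leaf_boundary(end)) {
			pr_warn("[%lx %lx] splits a leaf entry in linear map\n",
				phys_start, phys_end);
			return false;
		}

		/* vmemmap edges */
		sz = nr_pages * sizeof(struct page);
		start = (unsigned long)pfn_to_page(pfn);
		end = start + sz;
		if (split_kernel_leaf_boundary(start) ||
		    split_kernel_leaf_boundary(end)) {
			pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n",
				phys_start, phys_end);
			return false;
		}

		return true;
	}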

> +
> +	/* Assert linear map edges do not split a leaf entry */
> +	phys_start = PFN_PHYS(pfn);
> +	phys_end = phys_start + nr_pages * PAGE_SIZE;
> +	linear_start = __phys_to_virt(phys_start);
> +	linear_end =  __phys_to_virt(phys_end);
> +	if (split_kernel_leaf_boundary(linear_start) ||
> +	    split_kernel_leaf_boundary(linear_end)) {
> +		pr_warn("[%lx %lx] splits a leaf entry in linear map\n",
> +			phys_start, phys_end);
> +		return false;
> +	}
> +
> +	/* Assert vmemmap edges do not split a leaf entry */
> +	vmemmap_size = nr_pages * sizeof(struct page);
> +	vmemmap_start = (unsigned long) pfn_to_page(pfn);

nit:                                   ^

I don't think we would normally have that space?

> +	vmemmap_end = vmemmap_start + vmemmap_size;
> +	if (split_kernel_leaf_boundary(vmemmap_start) ||
> +	    split_kernel_leaf_boundary(vmemmap_end)) {
> +		pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n",
> +			phys_start, phys_end);
> +		return false;
> +	}
> +	return true;
> +}
> +
>  /*
>   * This memory hotplug notifier helps prevent boot memory from being
>   * inadvertently removed as it blocks pfn range offlining process in
> @@ -2083,6 +2206,9 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
>  	if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
>  		return NOTIFY_OK;
>  
> +	if (!can_unmap_without_split(pfn, arg->nr_pages))
> +		return NOTIFY_BAD;
> +

Personally, I'd keep the bootmem check first and do this check after. That means
an existing warning will not change.
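
i.e. something like the below (sketch; note the existing loop advances pfn,
so the check would need the original value, e.g. arg->start_pfn):

	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		/* existing bootmem section checks */
		...
	}

	if (!can_unmap_without_split(arg->start_pfn, arg->nr_pages))
		return NOTIFY_BAD;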

Thanks,
Ryan

>  	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
>  		unsigned long start = PFN_PHYS(pfn);
>  		unsigned long end = start + (1UL << PA_SECTION_SHIFT);
Re: [PATCH 2/2] arm64/mm: Reject memory removal that splits a kernel leaf mapping
Posted by Anshuman Khandual 4 days, 18 hours ago
On 02/02/26 3:12 PM, Ryan Roberts wrote:
> On 02/02/2026 04:26, Anshuman Khandual wrote:
>> Linear and vmemmap mappings that get torn down during a memory hot remove
>> operation might contain leaf entries at any page table level. If the
>> requested memory range's linear or vmemmap mappings fall within such leaf
>> entries, new mappings need to be created for the remaining memory previously
>> mapped by the leaf entry, following the standard break-before-make (BBM)
>> rules.
> 
> I think it would be good to mention that the kernel cannot tolerate BBM so
> remapping to fine grained leaves would not be possible on systems without
> BBML2_NOABORT.

Sure will add that.

> 
>>
>> Currently the memory hot remove operation does not perform such restructuring,
>> and so memory removals that would split a kernel leaf mapping need to be
>> rejected.
> 
> Perhaps it is useful to mention that while memory_hotplug.c does appear to
> permit hot-unplugging arbitrary ranges of memory, the higher layers that drive
> memory_hotplug (e.g. ACPI, virtio, ...) all appear to treat memory as fixed size
> devices so it is impossible to hotunplug a different amount than was previously
> hotplugged, and so we should never see a rejection in practice, but adding the
> check makes us robust against a future change.

Agreed, will update the commit message.

> 
>>
>> Cc: Catalin Marinas <catalin.marinas@arm.com>
>> Cc: Will Deacon <will@kernel.org>
>> Cc: linux-arm-kernel@lists.infradead.org
>> Cc: linux-kernel@vger.kernel.org
>> Closes: https://lore.kernel.org/all/aWZYXhrT6D2M-7-N@willie-the-truck/
>> Fixes: bbd6ec605c0f ("arm64/mm: Enable memory hot remove")
>> Cc: stable@vger.kernel.org
>> Suggested-by: Ryan Roberts <ryan.roberts@arm.com>
>> Signed-off-by: Anshuman Khandual <anshuman.khandual@arm.com>
>> ---
>>  arch/arm64/mm/mmu.c | 126 ++++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 126 insertions(+)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 8ec8a287aaa1..9d59e10fb3de 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -2063,6 +2063,129 @@ void arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
>>  	__remove_pgd_mapping(swapper_pg_dir, __phys_to_virt(start), size);
>>  }
>>  
>> +
>> +static bool split_kernel_leaf_boundary(unsigned long addr)
> 
> The name currently makes it sound like we are asking for the mapping to be split
> (we have existing functions to do this that are named similarly). Perhaps a
> better name would be addr_splits_leaf()?

Agreed, that name sounds a bit confusing and ambiguous as there is already
a similarly named function. Will rename it to addr_splits_kernel_leaf()
instead.

> 
>> +{
>> +	pgd_t *pgdp, pgd;
>> +	p4d_t *p4dp, p4d;
>> +	pud_t *pudp, pud;
>> +	pmd_t *pmdp, pmd;
>> +	pte_t *ptep, pte;
>> +
>> +	/*
>> +	 * PGD: If addr is PGD aligned then addr already
>> +	 * describes a leaf boundary.
>> +	 */
>> +	if (ALIGN_DOWN(addr, PGDIR_SIZE) == addr)
>> +		return false;
>> +
>> +	pgdp = pgd_offset_k(addr);
>> +	pgd = pgdp_get(pgdp);
>> +	if (!pgd_present(pgd))
>> +		return false;
>> +
>> +	/*
>> +	 * P4D: If addr is P4D aligned then addr already
>> +	 * describes a leaf boundary.
>> +	 */
>> +	if (ALIGN_DOWN(addr, P4D_SIZE) == addr)
>> +		return false;
>> +
>> +	p4dp = p4d_offset(pgdp, addr);
>> +	p4d = p4dp_get(p4dp);
>> +	if (!p4d_present(p4d))
>> +		return false;
>> +
>> +	/*
>> +	 * PUD: If addr is PUD aligned then addr already
>> +	 * describes a leaf boundary.
>> +	 */
>> +	if (ALIGN_DOWN(addr, PUD_SIZE) == addr)
>> +		return false;
>> +
>> +	pudp = pud_offset(p4dp, addr);
>> +	pud = pudp_get(pudp);
>> +	if (!pud_present(pud))
>> +		return false;
>> +
>> +	if (pud_leaf(pud))
>> +		return true;
>> +
>> +	/*
>> +	 * CONT_PMD: If addr is CONT_PMD aligned then
>> +	 * addr already describes a leaf boundary.
>> +	 */
>> +	if (ALIGN_DOWN(addr, CONT_PMD_SIZE) == addr)
>> +		return false;
>> +
>> +	pmdp = pmd_offset(pudp, addr);
>> +	pmd = pmdp_get(pmdp);
>> +	if (!pmd_present(pmd))
>> +		return false;
>> +
>> +	if (pmd_leaf(pmd) && pmd_cont(pmd))
>> +		return true;
>> +
>> +	/*
>> +	 * PMD: If addr is PMD aligned then addr already
>> +	 * describes a leaf boundary.
>> +	 */
>> +	if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
>> +		return false;
>> +
>> +	if (pmd_leaf(pmd))
>> +		return true;
>> +
>> +	/*
>> +	 * CONT_PTE: If addr is CONT_PTE aligned then addr
>> +	 * already describes a leaf boundary.
>> +	 */
>> +	if (ALIGN_DOWN(addr, CONT_PTE_SIZE) == addr)
>> +		return false;
>> +
>> +	ptep = pte_offset_kernel(pmdp, addr);
>> +	pte = __ptep_get(ptep);
>> +	if (!pte_present(pte))
>> +		return false;
>> +
>> +	if (pte_valid(pte) && pte_cont(pte))
> 
> Why do you need pte_valid() here? You have already checked !pte_present(). Are
> you expecting a case of present but not valid (PTE_PRESENT_INVALID)? If so, do
> you need to consider that for the other levels too? (pmd_leaf() only checks
> pmd_present()).
> 
> Personally I think you can just drop the pte_valid() check here.

Added pte_valid() out of an abundance of caution but it is not really
necessary. Sure, will drop it.

> 
>> +		return true;
>> +
>> +	if (ALIGN_DOWN(addr, PAGE_SIZE) == addr)
>> +		return false;
>> +	return true;
>> +}
>> +
>> +static bool can_unmap_without_split(unsigned long pfn, unsigned long nr_pages)
>> +{
>> +	unsigned long linear_start, linear_end, phys_start, phys_end;
>> +	unsigned long vmemmap_size, vmemmap_start, vmemmap_end;
> 
> nit: do we need all these variables. Perhaps just:
> 
> unsigned long sz, start, end, phys_start, phys_end;
> 
> are sufficient?

Alright. I guess start and end can be re-used for both the linear and
vmemmap mappings.

> 
>> +
>> +	/* Assert linear map edges do not split a leaf entry */
>> +	phys_start = PFN_PHYS(pfn);
>> +	phys_end = phys_start + nr_pages * PAGE_SIZE;
>> +	linear_start = __phys_to_virt(phys_start);
>> +	linear_end =  __phys_to_virt(phys_end);
>> +	if (split_kernel_leaf_boundary(linear_start) ||
>> +	    split_kernel_leaf_boundary(linear_end)) {
>> +		pr_warn("[%lx %lx] splits a leaf entry in linear map\n",
>> +			phys_start, phys_end);
>> +		return false;
>> +	}
>> +
>> +	/* Assert vmemmap edges do not split a leaf entry */
>> +	vmemmap_size = nr_pages * sizeof(struct page);
>> +	vmemmap_start = (unsigned long) pfn_to_page(pfn);
> 
> nit:                                   ^
> 
> I don't think we would normally have that space?

Sure will drop that.

> 
>> +	vmemmap_end = vmemmap_start + vmemmap_size;
>> +	if (split_kernel_leaf_boundary(vmemmap_start) ||
>> +	    split_kernel_leaf_boundary(vmemmap_end)) {
>> +		pr_warn("[%lx %lx] splits a leaf entry in vmemmap\n",
>> +			phys_start, phys_end);
>> +		return false;
>> +	}
>> +	return true;
>> +}
>> +
>>  /*
>>   * This memory hotplug notifier helps prevent boot memory from being
>>   * inadvertently removed as it blocks pfn range offlining process in
>> @@ -2083,6 +2206,9 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb,
>>  	if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE))
>>  		return NOTIFY_OK;
>>  
>> +	if (!can_unmap_without_split(pfn, arg->nr_pages))
>> +		return NOTIFY_BAD;
>> +
> 
> Personally, I'd keep the bootmem check first and do this check after. That means
> an existing warning will not change.

Makes sense, will move it after the existing bootmem check. BTW the function
is still named prevent_bootmem_remove_notifier() although now it's going to
check leaf boundaries as well. Should the function be renamed to something
more generic, e.g. prevent_memory_remove_notifier()?

> 
> Thanks,
> Ryan
> 
>>  	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
>>  		unsigned long start = PFN_PHYS(pfn);
>>  		unsigned long end = start + (1UL << PA_SECTION_SHIFT);
>
Re: [PATCH 2/2] arm64/mm: Reject memory removal that splits a kernel leaf mapping
Posted by Ryan Roberts 4 days, 18 hours ago
On 02/02/2026 11:06, Anshuman Khandual wrote:

[...]

> Makes sense, will move it after the existing bootmem check. BTW the function
> is still named prevent_bootmem_remove_notifier() although now it's going to
> check leaf boundaries as well. Should the function be renamed to something
> more generic, e.g. prevent_memory_remove_notifier()?
Works for me.

> 
>>
>> Thanks,
>> Ryan
>>
>>>  	for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
>>>  		unsigned long start = PFN_PHYS(pfn);
>>>  		unsigned long end = start + (1UL << PA_SECTION_SHIFT);
>>
>