From: Björn Töpel <bjorn@rivosinc.com>
For an architecture to support memory hotplugging, a couple of
callbacks needs to be implemented:
arch_add_memory()
This callback is responsible for adding the physical memory into the
direct map, and call into the memory hotplugging generic code via
__add_pages() that adds the corresponding struct page entries, and
updates the vmemmap mapping.
arch_remove_memory()
This is the inverse of the callback above.
vmemmap_free()
This function tears down the vmemmap mappings (if
CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
backing vmemmap pages. Note that for persistent memory, an
alternative allocator for the backing pages can be used; The
vmem_altmap. This means that when the backing pages are cleared,
extra care is needed so that the correct deallocation method is
used.
arch_get_mappable_range()
This functions returns the PA range that the direct map can map.
Used by the MHP internals for sanity checks.
The page table unmap/teardown functions are heavily based on code from
the x86 tree. The same remove_pgd_mapping() function is used in both
vmemmap_free() and arch_remove_memory(), but in the latter function
the backing pages are not removed.
Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
---
arch/riscv/mm/init.c | 242 +++++++++++++++++++++++++++++++++++++++++++
1 file changed, 242 insertions(+)
diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
index 6f72b0b2b854..7f0b921a3d3a 100644
--- a/arch/riscv/mm/init.c
+++ b/arch/riscv/mm/init.c
@@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void)
}
}
#endif
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
+{
+ pte_t *pte;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PTE; i++) {
+ pte = pte_start + i;
+ if (!pte_none(*pte))
+ return;
+ }
+
+ free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
+ pmd_clear(pmd);
+}
+
+static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
+{
+ pmd_t *pmd;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd = pmd_start + i;
+ if (!pmd_none(*pmd))
+ return;
+ }
+
+ free_pages((unsigned long)page_address(pud_page(*pud)), 0);
+ pud_clear(pud);
+}
+
+static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
+{
+ pud_t *pud;
+ int i;
+
+ for (i = 0; i < PTRS_PER_PUD; i++) {
+ pud = pud_start + i;
+ if (!pud_none(*pud))
+ return;
+ }
+
+ free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
+ p4d_clear(p4d);
+}
+
+static void __meminit free_vmemmap_storage(struct page *page, size_t size,
+ struct vmem_altmap *altmap)
+{
+ if (altmap)
+ vmem_altmap_free(altmap, size >> PAGE_SHIFT);
+ else
+ free_pages((unsigned long)page_address(page), get_order(size));
+}
+
+static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
+ bool is_vmemmap, struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ pte_t *ptep, pte;
+
+ for (; addr < end; addr = next) {
+ next = (addr + PAGE_SIZE) & PAGE_MASK;
+ if (next > end)
+ next = end;
+
+ ptep = pte_base + pte_index(addr);
+ pte = READ_ONCE(*ptep);
+
+ if (!pte_present(*ptep))
+ continue;
+
+ pte_clear(&init_mm, addr, ptep);
+ if (is_vmemmap)
+ free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
+ }
+}
+
+static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
+ bool is_vmemmap, struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ pte_t *pte_base;
+ pmd_t *pmdp, pmd;
+
+ for (; addr < end; addr = next) {
+ next = pmd_addr_end(addr, end);
+ pmdp = pmd_base + pmd_index(addr);
+ pmd = READ_ONCE(*pmdp);
+
+ if (!pmd_present(pmd))
+ continue;
+
+ if (pmd_leaf(pmd)) {
+ pmd_clear(pmdp);
+ if (is_vmemmap)
+ free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap);
+ continue;
+ }
+
+ pte_base = (pte_t *)pmd_page_vaddr(*pmdp);
+ remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
+ free_pte_table(pte_base, pmdp);
+ }
+}
+
+static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end,
+ bool is_vmemmap, struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ pud_t *pudp, pud;
+ pmd_t *pmd_base;
+
+ for (; addr < end; addr = next) {
+ next = pud_addr_end(addr, end);
+ pudp = pud_base + pud_index(addr);
+ pud = READ_ONCE(*pudp);
+
+ if (!pud_present(pud))
+ continue;
+
+ if (pud_leaf(pud)) {
+ if (pgtable_l4_enabled) {
+ pud_clear(pudp);
+ if (is_vmemmap)
+ free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap);
+ }
+ continue;
+ }
+
+ pmd_base = pmd_offset(pudp, 0);
+ remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap);
+
+ if (pgtable_l4_enabled)
+ free_pmd_table(pmd_base, pudp);
+ }
+}
+
+static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end,
+ bool is_vmemmap, struct vmem_altmap *altmap)
+{
+ unsigned long next;
+ p4d_t *p4dp, p4d;
+ pud_t *pud_base;
+
+ for (; addr < end; addr = next) {
+ next = p4d_addr_end(addr, end);
+ p4dp = p4d_base + p4d_index(addr);
+ p4d = READ_ONCE(*p4dp);
+
+ if (!p4d_present(p4d))
+ continue;
+
+ if (p4d_leaf(p4d)) {
+ if (pgtable_l5_enabled) {
+ p4d_clear(p4dp);
+ if (is_vmemmap)
+ free_vmemmap_storage(p4d_page(p4d), P4D_SIZE, altmap);
+ }
+ continue;
+ }
+
+ pud_base = pud_offset(p4dp, 0);
+ remove_pud_mapping(pud_base, addr, next, is_vmemmap, altmap);
+
+ if (pgtable_l5_enabled)
+ free_pud_table(pud_base, p4dp);
+ }
+}
+
+static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end, bool is_vmemmap,
+ struct vmem_altmap *altmap)
+{
+ unsigned long addr, next;
+ p4d_t *p4d_base;
+ pgd_t *pgd;
+
+ for (addr = va; addr < end; addr = next) {
+ next = pgd_addr_end(addr, end);
+ pgd = pgd_offset_k(addr);
+
+ if (!pgd_present(*pgd))
+ continue;
+
+ if (pgd_leaf(*pgd))
+ continue;
+
+ p4d_base = p4d_offset(pgd, 0);
+ remove_p4d_mapping(p4d_base, addr, next, is_vmemmap, altmap);
+ }
+
+ flush_tlb_all();
+}
+
+static void __meminit remove_linear_mapping(phys_addr_t start, u64 size)
+{
+ unsigned long va = (unsigned long)__va(start);
+ unsigned long end = (unsigned long)__va(start + size);
+
+ remove_pgd_mapping(va, end, false, NULL);
+}
+
+struct range arch_get_mappable_range(void)
+{
+ struct range mhp_range;
+
+ mhp_range.start = __pa(PAGE_OFFSET);
+ mhp_range.end = __pa(PAGE_END - 1);
+ return mhp_range;
+}
+
+int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
+{
+ int ret;
+
+ create_linear_mapping_range(start, start + size, 0, ¶ms->pgprot);
+ flush_tlb_all();
+ ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
+ if (ret) {
+ remove_linear_mapping(start, size);
+ return ret;
+ }
+
+ max_pfn = PFN_UP(start + size);
+ max_low_pfn = max_pfn;
+ return 0;
+}
+
+void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
+{
+ __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
+ remove_linear_mapping(start, size);
+}
+
+#ifdef CONFIG_SPARSEMEM_VMEMMAP
+void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
+{
+ remove_pgd_mapping(start, end, true, altmap);
+}
+#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+#endif /* CONFIG_MEMORY_HOTPLUG */
--
2.40.1
On Tue, May 14, 2024 at 04:04:42PM +0200, Björn Töpel wrote:
> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
> + struct vmem_altmap *altmap)
> +{
> + if (altmap)
> + vmem_altmap_free(altmap, size >> PAGE_SHIFT);
> + else
> + free_pages((unsigned long)page_address(page), get_order(size));
David already pointed this out, but can check
arch/x86/mm/init_64.c:free_pagetable().
You will see that we have to do some magic for bootmem memory (DIMMs
which were not hotplugged but already present)
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
> +{
> + remove_pgd_mapping(start, end, true, altmap);
> +}
> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */
> +#endif /* CONFIG_MEMORY_HOTPLUG */
I will comment on the patch where you add support for hotplug and the
dependency, but on a track in LSFMM today, we decided that most likely
we will drop memory-hotplug support for !CONFIG_SPARSEMEM_VMEMMAP
environments.
So, since you are adding this plain fresh, please consider to tight the
hotplug dependency to CONFIG_SPARSEMEM_VMEMMAP.
As a bonus, you will only have to maintain one flavour of functions.
--
Oscar Salvador
SUSE Labs
Oscar Salvador <osalvador@suse.de> writes:
> On Tue, May 14, 2024 at 04:04:42PM +0200, Björn Töpel wrote:
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> + struct vmem_altmap *altmap)
>> +{
>> + if (altmap)
>> + vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> + else
>> + free_pages((unsigned long)page_address(page), get_order(size));
>
> David already pointed this out, but can check
> arch/x86/mm/init_64.c:free_pagetable().
>
> You will see that we have to do some magic for bootmem memory (DIMMs
> which were not hotplugged but already present)
Thank you!
>> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
>> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
>> +{
>> + remove_pgd_mapping(start, end, true, altmap);
>> +}
>> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */
>> +#endif /* CONFIG_MEMORY_HOTPLUG */
>
> I will comment on the patch where you add support for hotplug and the
> dependency, but on a track in LSFMM today, we decided that most likely
> we will drop memory-hotplug support for !CONFIG_SPARSEMEM_VMEMMAP
> environments.
> So, since you are adding this plain fresh, please consider to tight the
> hotplug dependency to CONFIG_SPARSEMEM_VMEMMAP.
> As a bonus, you will only have to maintain one flavour of functions.
Ah, yeah, I saw it mentioned on the LSF/MM/BPF topics. Less is
definitely more -- I'll make the next version depend on
SPARSEMEM_VMEMMAP.
Björn
On Tue, May 14, 2024 at 4:05 PM Björn Töpel <bjorn@kernel.org> wrote:
>
> From: Björn Töpel <bjorn@rivosinc.com>
>
> For an architecture to support memory hotplugging, a couple of
> callbacks needs to be implemented:
>
> arch_add_memory()
> This callback is responsible for adding the physical memory into the
> direct map, and call into the memory hotplugging generic code via
> __add_pages() that adds the corresponding struct page entries, and
> updates the vmemmap mapping.
>
> arch_remove_memory()
> This is the inverse of the callback above.
>
> vmemmap_free()
> This function tears down the vmemmap mappings (if
> CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
> backing vmemmap pages. Note that for persistent memory, an
> alternative allocator for the backing pages can be used; The
> vmem_altmap. This means that when the backing pages are cleared,
> extra care is needed so that the correct deallocation method is
> used.
>
> arch_get_mappable_range()
> This functions returns the PA range that the direct map can map.
> Used by the MHP internals for sanity checks.
>
> The page table unmap/teardown functions are heavily based on code from
> the x86 tree. The same remove_pgd_mapping() function is used in both
> vmemmap_free() and arch_remove_memory(), but in the latter function
> the backing pages are not removed.
>
> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
> ---
> arch/riscv/mm/init.c | 242 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 242 insertions(+)
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 6f72b0b2b854..7f0b921a3d3a 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void)
> }
> }
> #endif
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> + pte_t *pte;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PTE; i++) {
> + pte = pte_start + i;
> + if (!pte_none(*pte))
> + return;
> + }
> +
> + free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
> + pmd_clear(pmd);
> +}
> +
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> + pmd_t *pmd;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + pmd = pmd_start + i;
> + if (!pmd_none(*pmd))
> + return;
> + }
> +
> + free_pages((unsigned long)page_address(pud_page(*pud)), 0);
> + pud_clear(pud);
> +}
> +
> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
> +{
> + pud_t *pud;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PUD; i++) {
> + pud = pud_start + i;
> + if (!pud_none(*pud))
> + return;
> + }
> +
> + free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
> + p4d_clear(p4d);
> +}
> +
> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
> + struct vmem_altmap *altmap)
> +{
> + if (altmap)
> + vmem_altmap_free(altmap, size >> PAGE_SHIFT);
> + else
> + free_pages((unsigned long)page_address(page), get_order(size));
> +}
> +
> +static void __meminit remove_pte_mapping(pte_t *pte_base, unsigned long addr, unsigned long end,
> + bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> + unsigned long next;
> + pte_t *ptep, pte;
> +
> + for (; addr < end; addr = next) {
> + next = (addr + PAGE_SIZE) & PAGE_MASK;
> + if (next > end)
> + next = end;
> +
> + ptep = pte_base + pte_index(addr);
> + pte = READ_ONCE(*ptep);
> +
> + if (!pte_present(*ptep))
> + continue;
> +
> + pte_clear(&init_mm, addr, ptep);
> + if (is_vmemmap)
> + free_vmemmap_storage(pte_page(pte), PAGE_SIZE, altmap);
> + }
> +}
> +
> +static void __meminit remove_pmd_mapping(pmd_t *pmd_base, unsigned long addr, unsigned long end,
> + bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> + unsigned long next;
> + pte_t *pte_base;
> + pmd_t *pmdp, pmd;
> +
> + for (; addr < end; addr = next) {
> + next = pmd_addr_end(addr, end);
> + pmdp = pmd_base + pmd_index(addr);
> + pmd = READ_ONCE(*pmdp);
> +
> + if (!pmd_present(pmd))
> + continue;
> +
> + if (pmd_leaf(pmd)) {
> + pmd_clear(pmdp);
> + if (is_vmemmap)
> + free_vmemmap_storage(pmd_page(pmd), PMD_SIZE, altmap);
> + continue;
> + }
> +
> + pte_base = (pte_t *)pmd_page_vaddr(*pmdp);
> + remove_pte_mapping(pte_base, addr, next, is_vmemmap, altmap);
> + free_pte_table(pte_base, pmdp);
> + }
> +}
> +
> +static void __meminit remove_pud_mapping(pud_t *pud_base, unsigned long addr, unsigned long end,
> + bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> + unsigned long next;
> + pud_t *pudp, pud;
> + pmd_t *pmd_base;
> +
> + for (; addr < end; addr = next) {
> + next = pud_addr_end(addr, end);
> + pudp = pud_base + pud_index(addr);
> + pud = READ_ONCE(*pudp);
> +
> + if (!pud_present(pud))
> + continue;
> +
> + if (pud_leaf(pud)) {
> + if (pgtable_l4_enabled) {
> + pud_clear(pudp);
> + if (is_vmemmap)
> + free_vmemmap_storage(pud_page(pud), PUD_SIZE, altmap);
> + }
> + continue;
> + }
> +
> + pmd_base = pmd_offset(pudp, 0);
> + remove_pmd_mapping(pmd_base, addr, next, is_vmemmap, altmap);
> +
> + if (pgtable_l4_enabled)
> + free_pmd_table(pmd_base, pudp);
> + }
> +}
> +
> +static void __meminit remove_p4d_mapping(p4d_t *p4d_base, unsigned long addr, unsigned long end,
> + bool is_vmemmap, struct vmem_altmap *altmap)
> +{
> + unsigned long next;
> + p4d_t *p4dp, p4d;
> + pud_t *pud_base;
> +
> + for (; addr < end; addr = next) {
> + next = p4d_addr_end(addr, end);
> + p4dp = p4d_base + p4d_index(addr);
> + p4d = READ_ONCE(*p4dp);
> +
> + if (!p4d_present(p4d))
> + continue;
> +
> + if (p4d_leaf(p4d)) {
> + if (pgtable_l5_enabled) {
> + p4d_clear(p4dp);
> + if (is_vmemmap)
> + free_vmemmap_storage(p4d_page(p4d), P4D_SIZE, altmap);
> + }
> + continue;
> + }
> +
> + pud_base = pud_offset(p4dp, 0);
> + remove_pud_mapping(pud_base, addr, next, is_vmemmap, altmap);
> +
> + if (pgtable_l5_enabled)
> + free_pud_table(pud_base, p4dp);
> + }
> +}
> +
> +static void __meminit remove_pgd_mapping(unsigned long va, unsigned long end, bool is_vmemmap,
> + struct vmem_altmap *altmap)
> +{
> + unsigned long addr, next;
> + p4d_t *p4d_base;
> + pgd_t *pgd;
> +
> + for (addr = va; addr < end; addr = next) {
> + next = pgd_addr_end(addr, end);
> + pgd = pgd_offset_k(addr);
> +
> + if (!pgd_present(*pgd))
> + continue;
> +
> + if (pgd_leaf(*pgd))
> + continue;
> +
> + p4d_base = p4d_offset(pgd, 0);
> + remove_p4d_mapping(p4d_base, addr, next, is_vmemmap, altmap);
> + }
> +
> + flush_tlb_all();
> +}
> +
> +static void __meminit remove_linear_mapping(phys_addr_t start, u64 size)
> +{
> + unsigned long va = (unsigned long)__va(start);
> + unsigned long end = (unsigned long)__va(start + size);
> +
> + remove_pgd_mapping(va, end, false, NULL);
> +}
> +
> +struct range arch_get_mappable_range(void)
> +{
> + struct range mhp_range;
> +
> + mhp_range.start = __pa(PAGE_OFFSET);
> + mhp_range.end = __pa(PAGE_END - 1);
> + return mhp_range;
> +}
> +
> +int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
> +{
> + int ret;
> +
> + create_linear_mapping_range(start, start + size, 0, ¶ms->pgprot);
> + flush_tlb_all();
> + ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
> + if (ret) {
> + remove_linear_mapping(start, size);
> + return ret;
> + }
> +
You need to flush the TLB here too since __add_pages() populates the
page table with the new vmemmap mapping (only because riscv allows to
cache invalid entries, I'll adapt this in my next version of Svvptc
support).
> + max_pfn = PFN_UP(start + size);
> + max_low_pfn = max_pfn;
> + return 0;
> +}
> +
> +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
> +{
> + __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
> + remove_linear_mapping(start, size);
You need to flush the TLB here too.
> +}
> +
> +#ifdef CONFIG_SPARSEMEM_VMEMMAP
> +void __ref vmemmap_free(unsigned long start, unsigned long end, struct vmem_altmap *altmap)
> +{
> + remove_pgd_mapping(start, end, true, altmap);
> +}
> +#endif /* CONFIG_SPARSEMEM_VMEMMAP */
> +#endif /* CONFIG_MEMORY_HOTPLUG */
> --
> 2.40.1
>
Alexandre Ghiti <alexghiti@rivosinc.com> writes:
> On Tue, May 14, 2024 at 4:05 PM Björn Töpel <bjorn@kernel.org> wrote:
>> +int __ref arch_add_memory(int nid, u64 start, u64 size, struct mhp_params *params)
>> +{
>> + int ret;
>> +
>> + create_linear_mapping_range(start, start + size, 0, ¶ms->pgprot);
>> + flush_tlb_all();
>> + ret = __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT, params);
>> + if (ret) {
>> + remove_linear_mapping(start, size);
>> + return ret;
>> + }
>> +
>
> You need to flush the TLB here too since __add_pages() populates the
> page table with the new vmemmap mapping (only because riscv allows to
> cache invalid entries, I'll adapt this in my next version of Svvptc
> support).
>
>> + max_pfn = PFN_UP(start + size);
>> + max_low_pfn = max_pfn;
>> + return 0;
>> +}
>> +
>> +void __ref arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap)
>> +{
>> + __remove_pages(start >> PAGE_SHIFT, size >> PAGE_SHIFT, altmap);
>> + remove_linear_mapping(start, size);
>
> You need to flush the TLB here too.
I'll address all of the above in the next version. Thanks for reviewing
the series!
Björn
On 14.05.24 16:04, Björn Töpel wrote:
> From: Björn Töpel <bjorn@rivosinc.com>
>
> For an architecture to support memory hotplugging, a couple of
> callbacks needs to be implemented:
>
> arch_add_memory()
> This callback is responsible for adding the physical memory into the
> direct map, and call into the memory hotplugging generic code via
> __add_pages() that adds the corresponding struct page entries, and
> updates the vmemmap mapping.
>
> arch_remove_memory()
> This is the inverse of the callback above.
>
> vmemmap_free()
> This function tears down the vmemmap mappings (if
> CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
> backing vmemmap pages. Note that for persistent memory, an
> alternative allocator for the backing pages can be used; The
> vmem_altmap. This means that when the backing pages are cleared,
> extra care is needed so that the correct deallocation method is
> used.
>
> arch_get_mappable_range()
> This functions returns the PA range that the direct map can map.
> Used by the MHP internals for sanity checks.
>
> The page table unmap/teardown functions are heavily based on code from
> the x86 tree. The same remove_pgd_mapping() function is used in both
> vmemmap_free() and arch_remove_memory(), but in the latter function
> the backing pages are not removed.
>
> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
> ---
> arch/riscv/mm/init.c | 242 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 242 insertions(+)
>
> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
> index 6f72b0b2b854..7f0b921a3d3a 100644
> --- a/arch/riscv/mm/init.c
> +++ b/arch/riscv/mm/init.c
> @@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void)
> }
> }
> #endif
> +
> +#ifdef CONFIG_MEMORY_HOTPLUG
> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
> +{
> + pte_t *pte;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PTE; i++) {
> + pte = pte_start + i;
> + if (!pte_none(*pte))
> + return;
> + }
> +
> + free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
> + pmd_clear(pmd);
> +}
> +
> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
> +{
> + pmd_t *pmd;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PMD; i++) {
> + pmd = pmd_start + i;
> + if (!pmd_none(*pmd))
> + return;
> + }
> +
> + free_pages((unsigned long)page_address(pud_page(*pud)), 0);
> + pud_clear(pud);
> +}
> +
> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
> +{
> + pud_t *pud;
> + int i;
> +
> + for (i = 0; i < PTRS_PER_PUD; i++) {
> + pud = pud_start + i;
> + if (!pud_none(*pud))
> + return;
> + }
> +
> + free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
> + p4d_clear(p4d);
> +}
> +
> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
> + struct vmem_altmap *altmap)
> +{
> + if (altmap)
> + vmem_altmap_free(altmap, size >> PAGE_SHIFT);
> + else
> + free_pages((unsigned long)page_address(page), get_order(size));
If you unplug a DIMM that was added during boot (can happen on x86-64,
can it happen on riscv?), free_pages() would not be sufficient. You'd be
freeing a PG_reserved page that has to be freed differently.
--
Cheers,
David / dhildenb
David Hildenbrand <david@redhat.com> writes:
> On 14.05.24 16:04, Björn Töpel wrote:
>> From: Björn Töpel <bjorn@rivosinc.com>
>>
>> For an architecture to support memory hotplugging, a couple of
>> callbacks needs to be implemented:
>>
>> arch_add_memory()
>> This callback is responsible for adding the physical memory into the
>> direct map, and call into the memory hotplugging generic code via
>> __add_pages() that adds the corresponding struct page entries, and
>> updates the vmemmap mapping.
>>
>> arch_remove_memory()
>> This is the inverse of the callback above.
>>
>> vmemmap_free()
>> This function tears down the vmemmap mappings (if
>> CONFIG_SPARSEMEM_VMEMMAP is enabled), and also deallocates the
>> backing vmemmap pages. Note that for persistent memory, an
>> alternative allocator for the backing pages can be used; The
>> vmem_altmap. This means that when the backing pages are cleared,
>> extra care is needed so that the correct deallocation method is
>> used.
>>
>> arch_get_mappable_range()
>> This functions returns the PA range that the direct map can map.
>> Used by the MHP internals for sanity checks.
>>
>> The page table unmap/teardown functions are heavily based on code from
>> the x86 tree. The same remove_pgd_mapping() function is used in both
>> vmemmap_free() and arch_remove_memory(), but in the latter function
>> the backing pages are not removed.
>>
>> Signed-off-by: Björn Töpel <bjorn@rivosinc.com>
>> ---
>> arch/riscv/mm/init.c | 242 +++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 242 insertions(+)
>>
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index 6f72b0b2b854..7f0b921a3d3a 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -1493,3 +1493,245 @@ void __init pgtable_cache_init(void)
>> }
>> }
>> #endif
>> +
>> +#ifdef CONFIG_MEMORY_HOTPLUG
>> +static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd)
>> +{
>> + pte_t *pte;
>> + int i;
>> +
>> + for (i = 0; i < PTRS_PER_PTE; i++) {
>> + pte = pte_start + i;
>> + if (!pte_none(*pte))
>> + return;
>> + }
>> +
>> + free_pages((unsigned long)page_address(pmd_page(*pmd)), 0);
>> + pmd_clear(pmd);
>> +}
>> +
>> +static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud)
>> +{
>> + pmd_t *pmd;
>> + int i;
>> +
>> + for (i = 0; i < PTRS_PER_PMD; i++) {
>> + pmd = pmd_start + i;
>> + if (!pmd_none(*pmd))
>> + return;
>> + }
>> +
>> + free_pages((unsigned long)page_address(pud_page(*pud)), 0);
>> + pud_clear(pud);
>> +}
>> +
>> +static void __meminit free_pud_table(pud_t *pud_start, p4d_t *p4d)
>> +{
>> + pud_t *pud;
>> + int i;
>> +
>> + for (i = 0; i < PTRS_PER_PUD; i++) {
>> + pud = pud_start + i;
>> + if (!pud_none(*pud))
>> + return;
>> + }
>> +
>> + free_pages((unsigned long)page_address(p4d_page(*p4d)), 0);
>> + p4d_clear(p4d);
>> +}
>> +
>> +static void __meminit free_vmemmap_storage(struct page *page, size_t size,
>> + struct vmem_altmap *altmap)
>> +{
>> + if (altmap)
>> + vmem_altmap_free(altmap, size >> PAGE_SHIFT);
>> + else
>> + free_pages((unsigned long)page_address(page), get_order(size));
>
> If you unplug a DIMM that was added during boot (can happen on x86-64,
> can it happen on riscv?), free_pages() would not be sufficient. You'd be
> freeing a PG_reserved page that has to be freed differently.
I'd say if it can happen on x86-64, it probably can on RISC-V. I'll look
into this for the next spin!
Thanks for spending time on the series!
Cheers,
Björn
© 2016 - 2026 Red Hat, Inc.