[v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 1 month ago
Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
rodata=full"), the direct mapping may be split on some machines instead
of staying static after boot. This makes it more useful than before to
show the direct mapping use in /proc/meminfo.
This patch makes /proc/meminfo show the direct mapping use as below
(4K base page size):
DirectMap4k:	   94792 kB
DirectMap64k:	  134208 kB
DirectMap2M:	 1173504 kB
DirectMap32M:	 5636096 kB
DirectMap1G:	529530880 kB

Although only the machines which support BBML2_NOABORT can split the
direct mapping, show it on all machines regardless of BBML2_NOABORT so
that users get a consistent view and avoid confusion.

ptdump can also tell the direct map use, but it needs to dump the
whole kernel page table, which is costly and overkill. It is also in
debugfs, which may not be enabled by all distros. So showing direct
map use in /proc/meminfo is more convenient and has less overhead.
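
As an illustration only (not part of the patch), consuming the new
fields from userspace just means scanning a small text file, in
contrast to walking the whole kernel page table via ptdump. A minimal
sketch:

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *fp = fopen("/proc/meminfo", "r");

	if (!fp) {
		perror("fopen /proc/meminfo");
		return 1;
	}

	/* Print only the DirectMap* lines reported by arch_report_meminfo(). */
	while (fgets(line, sizeof(line), fp)) {
		if (!strncmp(line, "DirectMap", strlen("DirectMap")))
			fputs(line, stdout);
	}

	fclose(fp);
	return 0;
}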

Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
---
v5: * Rebased to v6.19-rc4
    * Fixed the build error for !CONFIG_PROC_FS
v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
    * Used shorter name for the helpers and variables per Ryan
    * Fixed accounting for memory hotunplug
v3: * Fixed the over-accounting problems per Ryan
    * Introduced helpers for add/sub direct map use and #ifdef them with
      CONFIG_PROC_FS per Ryan
    * v3 is a fix patch on top of v2
v2: * Counted in size instead of the number of entries per Ryan
    * Removed shift array per Ryan
    * Used lower case "k" per Ryan
    * Fixed a couple of build warnings reported by kernel test robot
    * Fixed a couple of potential miscounts

 arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 181 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8e1d80a7033e..422441c9a992 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -29,6 +29,7 @@
 #include <linux/mm_inline.h>
 #include <linux/pagewalk.h>
 #include <linux/stop_machine.h>
+#include <linux/proc_fs.h>
 
 #include <asm/barrier.h>
 #include <asm/cputype.h>
@@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
 	dsb(ishst);
 }
 
+enum dm_type {
+	PTE,
+	CONT_PTE,
+	PMD,
+	CONT_PMD,
+	PUD,
+	NR_DM_TYPE,
+};
+
+#ifdef CONFIG_PROC_FS
+static unsigned long dm_meminfo[NR_DM_TYPE];
+
+void arch_report_meminfo(struct seq_file *m)
+{
+	char *size[NR_DM_TYPE];
+
+#if defined(CONFIG_ARM64_4K_PAGES)
+	size[PTE] = "4k";
+	size[CONT_PTE] = "64k";
+	size[PMD] = "2M";
+	size[CONT_PMD] = "32M";
+	size[PUD] = "1G";
+#elif defined(CONFIG_ARM64_16K_PAGES)
+	size[PTE] = "16k";
+	size[CONT_PTE] = "2M";
+	size[PMD] = "32M";
+	size[CONT_PMD] = "1G";
+#elif defined(CONFIG_ARM64_64K_PAGES)
+	size[PTE] = "64k";
+	size[CONT_PTE] = "2M";
+	size[PMD] = "512M";
+	size[CONT_PMD] = "16G";
+#endif
+
+	seq_printf(m, "DirectMap%s:	%8lu kB\n",
+			size[PTE], dm_meminfo[PTE] >> 10);
+	seq_printf(m, "DirectMap%s:	%8lu kB\n",
+			size[CONT_PTE],
+			dm_meminfo[CONT_PTE] >> 10);
+	seq_printf(m, "DirectMap%s:	%8lu kB\n",
+			size[PMD], dm_meminfo[PMD] >> 10);
+	seq_printf(m, "DirectMap%s:	%8lu kB\n",
+			size[CONT_PMD],
+			dm_meminfo[CONT_PMD] >> 10);
+	if (pud_sect_supported())
+		seq_printf(m, "DirectMap%s:	%8lu kB\n",
+			size[PUD], dm_meminfo[PUD] >> 10);
+}
+
+static inline bool is_dm_addr(unsigned long addr)
+{
+	return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
+}
+
+static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
+				  enum dm_type type)
+{
+	if (is_dm_addr(addr))
+		dm_meminfo[type] += size;
+}
+
+static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
+				  enum dm_type type)
+{
+	if (is_dm_addr(addr))
+		dm_meminfo[type] -= size;
+}
+#else
+static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
+				  enum dm_type type)
+{
+}
+
+static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
+				  enum dm_type type)
+{
+}
+#endif
+
 static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
 		     phys_addr_t phys, pgprot_t prot)
 {
@@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
 
 		init_pte(ptep, addr, next, phys, __prot);
 
+		if (pgprot_val(__prot) & PTE_CONT)
+			dm_meminfo_add(addr, (next - addr), CONT_PTE);
+		else
+			dm_meminfo_add(addr, (next - addr), PTE);
+
 		ptep += pte_index(next) - pte_index(addr);
 		phys += next - addr;
 	} while (addr = next, addr != end);
@@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		    (flags & NO_BLOCK_MAPPINGS) == 0) {
 			pmd_set_huge(pmdp, phys, prot);
 
+			/*
+			 * It is possible to have mappings allow cont mapping
+			 * but disallow block mapping. For example,
+			 * map_entry_trampoline().
+			 * So we have to increase CONT_PMD and PMD size here
+			 * to avoid double counting.
+			 */
+			if (pgprot_val(prot) & PTE_CONT)
+				dm_meminfo_add(addr, (next - addr), CONT_PMD);
+			else
+				dm_meminfo_add(addr, (next - addr), PMD);
 			/*
 			 * After the PMD entry has been populated once, we
 			 * only allow updates to the permission attributes.
@@ -389,6 +485,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
 		    (flags & NO_BLOCK_MAPPINGS) == 0) {
 			pud_set_huge(pudp, phys, prot);
 
+			dm_meminfo_add(addr, (next - addr), PUD);
 			/*
 			 * After the PUD entry has been populated once, we
 			 * only allow updates to the permission attributes.
@@ -575,16 +672,21 @@ pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
 	return  __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
 }
 
-static void split_contpte(pte_t *ptep)
+static void split_contpte(unsigned long addr, pte_t *ptep)
 {
 	int i;
 
+	dm_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
+
 	ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
 	for (i = 0; i < CONT_PTES; i++, ptep++)
 		__set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
+
+	dm_meminfo_add(addr, CONT_PTE_SIZE, PTE);
 }
 
-static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
+static int split_pmd(unsigned long addr, pmd_t *pmdp, pmd_t pmd, gfp_t gfp,
+		     bool to_cont)
 {
 	pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
 	unsigned long pfn = pmd_pfn(pmd);
@@ -606,8 +708,13 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
 	if (to_cont)
 		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 
+	dm_meminfo_sub(addr, PMD_SIZE, PMD);
 	for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
 		__set_pte(ptep, pfn_pte(pfn, prot));
+	if (to_cont)
+		dm_meminfo_add(addr, PMD_SIZE, CONT_PTE);
+	else
+		dm_meminfo_add(addr, PMD_SIZE, PTE);
 
 	/*
 	 * Ensure the pte entries are visible to the table walker by the time
@@ -619,16 +726,21 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
 	return 0;
 }
 
-static void split_contpmd(pmd_t *pmdp)
+static void split_contpmd(unsigned long addr, pmd_t *pmdp)
 {
 	int i;
 
+	dm_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
+
 	pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
 	for (i = 0; i < CONT_PMDS; i++, pmdp++)
 		set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
+
+	dm_meminfo_add(addr, CONT_PMD_SIZE, PMD);
 }
 
-static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
+static int split_pud(unsigned long addr, pud_t *pudp, pud_t pud, gfp_t gfp,
+		     bool to_cont)
 {
 	pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
 	unsigned int step = PMD_SIZE >> PAGE_SHIFT;
@@ -651,8 +763,13 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
 	if (to_cont)
 		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
 
+	dm_meminfo_sub(addr, PUD_SIZE, PUD);
 	for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step)
 		set_pmd(pmdp, pfn_pmd(pfn, prot));
+	if (to_cont)
+		dm_meminfo_add(addr, PUD_SIZE, CONT_PMD);
+	else
+		dm_meminfo_add(addr, PUD_SIZE, PMD);
 
 	/*
 	 * Ensure the pmd entries are visible to the table walker by the time
@@ -707,7 +824,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
 	if (!pud_present(pud))
 		goto out;
 	if (pud_leaf(pud)) {
-		ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
+		ret = split_pud(addr, pudp, pud, GFP_PGTABLE_KERNEL, true);
 		if (ret)
 			goto out;
 	}
@@ -725,14 +842,14 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
 		goto out;
 	if (pmd_leaf(pmd)) {
 		if (pmd_cont(pmd))
-			split_contpmd(pmdp);
+			split_contpmd(addr, pmdp);
 		/*
 		 * PMD: If addr is PMD aligned then addr already describes a
 		 * leaf boundary. Otherwise, split to contpte.
 		 */
 		if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
 			goto out;
-		ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
+		ret = split_pmd(addr, pmdp, pmd, GFP_PGTABLE_KERNEL, true);
 		if (ret)
 			goto out;
 	}
@@ -749,7 +866,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
 	if (!pte_present(pte))
 		goto out;
 	if (pte_cont(pte))
-		split_contpte(ptep);
+		split_contpte(addr, ptep);
 
 out:
 	return ret;
@@ -835,7 +952,7 @@ static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
 	int ret = 0;
 
 	if (pud_leaf(pud))
-		ret = split_pud(pudp, pud, gfp, false);
+		ret = split_pud(addr, pudp, pud, gfp, false);
 
 	return ret;
 }
@@ -849,8 +966,8 @@ static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
 
 	if (pmd_leaf(pmd)) {
 		if (pmd_cont(pmd))
-			split_contpmd(pmdp);
-		ret = split_pmd(pmdp, pmd, gfp, false);
+			split_contpmd(addr, pmdp);
+		ret = split_pmd(addr, pmdp, pmd, gfp, false);
 
 		/*
 		 * We have split the pmd directly to ptes so there is no need to
@@ -868,7 +985,7 @@ static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
 	pte_t pte = __ptep_get(ptep);
 
 	if (pte_cont(pte))
-		split_contpte(ptep);
+		split_contpte(addr, ptep);
 
 	return 0;
 }
@@ -1444,37 +1561,57 @@ static bool pgtable_range_aligned(unsigned long start, unsigned long end,
 	return true;
 }
 
-static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
+static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
 				    unsigned long end, bool free_mapped,
 				    struct vmem_altmap *altmap)
 {
-	pte_t *ptep, pte;
+	pte_t pte;
 
 	do {
-		ptep = pte_offset_kernel(pmdp, addr);
 		pte = __ptep_get(ptep);
 		if (pte_none(pte))
 			continue;
 
 		WARN_ON(!pte_present(pte));
 		__pte_clear(&init_mm, addr, ptep);
+		dm_meminfo_sub(addr, PAGE_SIZE, PTE);
 		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
 		if (free_mapped)
 			free_hotplug_page_range(pte_page(pte),
 						PAGE_SIZE, altmap);
-	} while (addr += PAGE_SIZE, addr < end);
+	} while (ptep++, addr += PAGE_SIZE, addr < end);
+}
+
+static void unmap_hotplug_cont_pte_range(pmd_t *pmdp, unsigned long addr,
+					 unsigned long end, bool free_mapped,
+					 struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pte_t *ptep, pte;
+
+	do {
+		next = pte_cont_addr_end(addr, end);
+		ptep = pte_offset_kernel(pmdp, addr);
+		pte = __ptep_get(ptep);
+
+		if (pte_present(pte) && pte_cont(pte)) {
+			dm_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
+			dm_meminfo_add(addr, CONT_PTE_SIZE, PTE);
+		}
+
+		unmap_hotplug_pte_range(ptep, addr, next, free_mapped, altmap);
+	} while (addr = next, addr < end);
 }
 
-static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
+static void unmap_hotplug_pmd_range(pmd_t *pmdp, unsigned long addr,
 				    unsigned long end, bool free_mapped,
 				    struct vmem_altmap *altmap)
 {
 	unsigned long next;
-	pmd_t *pmdp, pmd;
+	pmd_t pmd;
 
 	do {
 		next = pmd_addr_end(addr, end);
-		pmdp = pmd_offset(pudp, addr);
 		pmd = READ_ONCE(*pmdp);
 		if (pmd_none(pmd))
 			continue;
@@ -1482,6 +1619,7 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
 		WARN_ON(!pmd_present(pmd));
 		if (pmd_sect(pmd)) {
 			pmd_clear(pmdp);
+			dm_meminfo_sub(addr, PMD_SIZE, PMD);
 
 			/*
 			 * One TLBI should be sufficient here as the PMD_SIZE
@@ -1494,7 +1632,28 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
 			continue;
 		}
 		WARN_ON(!pmd_table(pmd));
-		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
+		unmap_hotplug_cont_pte_range(pmdp, addr, next, free_mapped, altmap);
+	} while (pmdp++, addr = next, addr < end);
+}
+
+static void unmap_hotplug_cont_pmd_range(pud_t *pudp, unsigned long addr,
+					 unsigned long end, bool free_mapped,
+					 struct vmem_altmap *altmap)
+{
+	unsigned long next;
+	pmd_t *pmdp, pmd;
+
+	do {
+		next = pmd_cont_addr_end(addr, end);
+		pmdp = pmd_offset(pudp, addr);
+		pmd = READ_ONCE(*pmdp);
+
+		if (pmd_leaf(pmd) && pmd_cont(pmd)) {
+			dm_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
+			dm_meminfo_add(addr, CONT_PMD_SIZE, PMD);
+		}
+
+		unmap_hotplug_pmd_range(pmdp, addr, next, free_mapped, altmap);
 	} while (addr = next, addr < end);
 }
 
@@ -1515,6 +1674,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
 		WARN_ON(!pud_present(pud));
 		if (pud_sect(pud)) {
 			pud_clear(pudp);
+			dm_meminfo_sub(addr, PUD_SIZE, PUD);
 
 			/*
 			 * One TLBI should be sufficient here as the PUD_SIZE
@@ -1527,7 +1687,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
 			continue;
 		}
 		WARN_ON(!pud_table(pud));
-		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
+		unmap_hotplug_cont_pmd_range(pudp, addr, next, free_mapped, altmap);
 	} while (addr = next, addr < end);
 }
 
-- 
2.47.0
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Anshuman Khandual 2 weeks, 2 days ago
Hello Yang,

On 07/01/26 5:59 AM, Yang Shi wrote:
> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
> rodata=full"), the direct mapping may be split on some machines instead
> keeping static since boot. It makes more sense to show the direct mapping
> use in /proc/meminfo than before.

I guess the direct mapping here refers to the linear map? IIUC it is called
direct map on x86 and linear map on arm64 platforms. Then should it not
be renamed as s/DirectMap/LinearMap instead? This will align with the names
from ptdump as well.

Before the above mentioned commit, the linear map could get altered by
memory hotplug and remove events as well.

> This patch will make /proc/meminfo show the direct mapping use like the
> below (4K base page size):
> DirectMap4K:	   94792 kB
> DirectMap64K:	  134208 kB
> DirectMap2M:	 1173504 kB
> DirectMap32M:	 5636096 kB
> DirectMap1G:	529530880 kB

If the /proc/meminfo interface is getting updated via arch_report_meminfo(),
why not add stats for all kernel virtual address space ranges, including
vmemmap, vmalloc etc., i.e. all address range headers in ptdump, as many of
those could change during system runtime? What makes the linear mapping
special?

> 
> Although just the machines which support BBML2_NOABORT can split the
> direct mapping, show it on all machines regardless of BBML2_NOABORT so
> that the users have consistent view in order to avoid confusion.
> 
> Although ptdump also can tell the direct map use, but it needs to dump
> the whole kernel page table. It is costly and overkilling. It is also
> in debugfs which may not be enabled by all distros. So showing direct
> map use in /proc/meminfo seems more convenient and has less overhead.

Agreed, a broader /proc/meminfo based kernel virtual address space stats
display will complement ptdump, which provides more granular information
about the mappings (with additional cost and setup), but it should cover
all the regions in kernel virtual space.

> 
> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
> ---
> v5: * Rebased to v6.19-rc4
>     * Fixed the build error for !CONFIG_PROC_FS
> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>     * Used shorter name for the helpers and variables per Ryan
>     * Fixed accounting for memory hotunplug
> v3: * Fixed the over-accounting problems per Ryan
>     * Introduced helpers for add/sub direct map use and #ifdef them with
>       CONFIG_PROC_FS per Ryan
>     * v3 is a fix patch on top of v2
> v2: * Counted in size instead of the number of entries per Ryan
>     * Removed shift array per Ryan
>     * Use lower case "k" per Ryan
>     * Fixed a couple of build warnings reported by kernel test robot
>     * Fixed a couple of poential miscounts
> 
>  arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 181 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 8e1d80a7033e..422441c9a992 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -29,6 +29,7 @@
>  #include <linux/mm_inline.h>
>  #include <linux/pagewalk.h>
>  #include <linux/stop_machine.h>
> +#include <linux/proc_fs.h>
>  
>  #include <asm/barrier.h>
>  #include <asm/cputype.h>
> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>  	dsb(ishst);
>  }
>  
> +enum dm_type {
> +	PTE,
> +	CONT_PTE,
> +	PMD,
> +	CONT_PMD,
> +	PUD,
> +	NR_DM_TYPE,
> +};
> +
> +#ifdef CONFIG_PROC_FS
> +static unsigned long dm_meminfo[NR_DM_TYPE];
> +
> +void arch_report_meminfo(struct seq_file *m)
> +{
> +	char *size[NR_DM_TYPE];
> +
> +#if defined(CONFIG_ARM64_4K_PAGES)
> +	size[PTE] = "4k";
> +	size[CONT_PTE] = "64k";
> +	size[PMD] = "2M";
> +	size[CONT_PMD] = "32M";
> +	size[PUD] = "1G";
> +#elif defined(CONFIG_ARM64_16K_PAGES)
> +	size[PTE] = "16k";
> +	size[CONT_PTE] = "2M";
> +	size[PMD] = "32M";
> +	size[CONT_PMD] = "1G";
> +#elif defined(CONFIG_ARM64_64K_PAGES)
> +	size[PTE] = "64k";
> +	size[CONT_PTE] = "2M";
> +	size[PMD] = "512M";
> +	size[CONT_PMD] = "16G";
> +#endif
> +
> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> +			size[PTE], dm_meminfo[PTE] >> 10);
> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> +			size[CONT_PTE],
> +			dm_meminfo[CONT_PTE] >> 10);
> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> +			size[PMD], dm_meminfo[PMD] >> 10);
> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> +			size[CONT_PMD],
> +			dm_meminfo[CONT_PMD] >> 10);
> +	if (pud_sect_supported())
> +		seq_printf(m, "DirectMap%s:	%8lu kB\n",
> +			size[PUD], dm_meminfo[PUD] >> 10);
> +}
> +
> +static inline bool is_dm_addr(unsigned long addr)
> +{
> +	return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
> +}
> +
> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
> +				  enum dm_type type)
> +{
> +	if (is_dm_addr(addr))
> +		dm_meminfo[type] += size;
> +}
> +
> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
> +				  enum dm_type type)
> +{
> +	if (is_dm_addr(addr))
> +		dm_meminfo[type] -= size;
> +}
> +#else
> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
> +				  enum dm_type type)
> +{
> +}
> +
> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
> +				  enum dm_type type)
> +{
> +}
> +#endif
> +
>  static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>  		     phys_addr_t phys, pgprot_t prot)
>  {
> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>  
>  		init_pte(ptep, addr, next, phys, __prot);
>  
> +		if (pgprot_val(__prot) & PTE_CONT)
> +			dm_meminfo_add(addr, (next - addr), CONT_PTE);
> +		else
> +			dm_meminfo_add(addr, (next - addr), PTE);
> +
>  		ptep += pte_index(next) - pte_index(addr);
>  		phys += next - addr;
>  	} while (addr = next, addr != end);
> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>  		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>  			pmd_set_huge(pmdp, phys, prot);
>  
> +			/*
> +			 * It is possible to have mappings allow cont mapping
> +			 * but disallow block mapping. For example,
> +			 * map_entry_trampoline().
> +			 * So we have to increase CONT_PMD and PMD size here
> +			 * to avoid double counting.
> +			 */
> +			if (pgprot_val(prot) & PTE_CONT)
> +				dm_meminfo_add(addr, (next - addr), CONT_PMD);
> +			else
> +				dm_meminfo_add(addr, (next - addr), PMD);
>  			/*
>  			 * After the PMD entry has been populated once, we
>  			 * only allow updates to the permission attributes.
> @@ -389,6 +485,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>  		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>  			pud_set_huge(pudp, phys, prot);
>  
> +			dm_meminfo_add(addr, (next - addr), PUD);
>  			/*
>  			 * After the PUD entry has been populated once, we
>  			 * only allow updates to the permission attributes.
> @@ -575,16 +672,21 @@ pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
>  	return  __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
>  }
>  
> -static void split_contpte(pte_t *ptep)
> +static void split_contpte(unsigned long addr, pte_t *ptep)
>  {
>  	int i;
>  
> +	dm_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
> +
>  	ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
>  	for (i = 0; i < CONT_PTES; i++, ptep++)
>  		__set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
> +
> +	dm_meminfo_add(addr, CONT_PTE_SIZE, PTE);
>  }
>  
> -static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
> +static int split_pmd(unsigned long addr, pmd_t *pmdp, pmd_t pmd, gfp_t gfp,
> +		     bool to_cont)
>  {
>  	pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
>  	unsigned long pfn = pmd_pfn(pmd);
> @@ -606,8 +708,13 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>  	if (to_cont)
>  		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>  
> +	dm_meminfo_sub(addr, PMD_SIZE, PMD);
>  	for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
>  		__set_pte(ptep, pfn_pte(pfn, prot));
> +	if (to_cont)
> +		dm_meminfo_add(addr, PMD_SIZE, CONT_PTE);
> +	else
> +		dm_meminfo_add(addr, PMD_SIZE, PTE);
>  
>  	/*
>  	 * Ensure the pte entries are visible to the table walker by the time
> @@ -619,16 +726,21 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>  	return 0;
>  }
>  
> -static void split_contpmd(pmd_t *pmdp)
> +static void split_contpmd(unsigned long addr, pmd_t *pmdp)
>  {
>  	int i;
>  
> +	dm_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
> +
>  	pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
>  	for (i = 0; i < CONT_PMDS; i++, pmdp++)
>  		set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
> +
> +	dm_meminfo_add(addr, CONT_PMD_SIZE, PMD);
>  }
>  
> -static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
> +static int split_pud(unsigned long addr, pud_t *pudp, pud_t pud, gfp_t gfp,
> +		     bool to_cont)
>  {
>  	pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
>  	unsigned int step = PMD_SIZE >> PAGE_SHIFT;
> @@ -651,8 +763,13 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
>  	if (to_cont)
>  		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>  
> +	dm_meminfo_sub(addr, PUD_SIZE, PUD);
>  	for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step)
>  		set_pmd(pmdp, pfn_pmd(pfn, prot));
> +	if (to_cont)
> +		dm_meminfo_add(addr, PUD_SIZE, CONT_PMD);
> +	else
> +		dm_meminfo_add(addr, PUD_SIZE, PMD);
>  
>  	/*
>  	 * Ensure the pmd entries are visible to the table walker by the time
> @@ -707,7 +824,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>  	if (!pud_present(pud))
>  		goto out;
>  	if (pud_leaf(pud)) {
> -		ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
> +		ret = split_pud(addr, pudp, pud, GFP_PGTABLE_KERNEL, true);
>  		if (ret)
>  			goto out;
>  	}
> @@ -725,14 +842,14 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>  		goto out;
>  	if (pmd_leaf(pmd)) {
>  		if (pmd_cont(pmd))
> -			split_contpmd(pmdp);
> +			split_contpmd(addr, pmdp);
>  		/*
>  		 * PMD: If addr is PMD aligned then addr already describes a
>  		 * leaf boundary. Otherwise, split to contpte.
>  		 */
>  		if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
>  			goto out;
> -		ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
> +		ret = split_pmd(addr, pmdp, pmd, GFP_PGTABLE_KERNEL, true);
>  		if (ret)
>  			goto out;
>  	}
> @@ -749,7 +866,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>  	if (!pte_present(pte))
>  		goto out;
>  	if (pte_cont(pte))
> -		split_contpte(ptep);
> +		split_contpte(addr, ptep);
>  
>  out:
>  	return ret;
> @@ -835,7 +952,7 @@ static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
>  	int ret = 0;
>  
>  	if (pud_leaf(pud))
> -		ret = split_pud(pudp, pud, gfp, false);
> +		ret = split_pud(addr, pudp, pud, gfp, false);
>  
>  	return ret;
>  }
> @@ -849,8 +966,8 @@ static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
>  
>  	if (pmd_leaf(pmd)) {
>  		if (pmd_cont(pmd))
> -			split_contpmd(pmdp);
> -		ret = split_pmd(pmdp, pmd, gfp, false);
> +			split_contpmd(addr, pmdp);
> +		ret = split_pmd(addr, pmdp, pmd, gfp, false);
>  
>  		/*
>  		 * We have split the pmd directly to ptes so there is no need to
> @@ -868,7 +985,7 @@ static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
>  	pte_t pte = __ptep_get(ptep);
>  
>  	if (pte_cont(pte))
> -		split_contpte(ptep);
> +		split_contpte(addr, ptep);
>  
>  	return 0;
>  }
> @@ -1444,37 +1561,57 @@ static bool pgtable_range_aligned(unsigned long start, unsigned long end,
>  	return true;
>  }
>  
> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>  				    unsigned long end, bool free_mapped,
>  				    struct vmem_altmap *altmap)
>  {
> -	pte_t *ptep, pte;
> +	pte_t pte;
>  
>  	do {
> -		ptep = pte_offset_kernel(pmdp, addr);
>  		pte = __ptep_get(ptep);
>  		if (pte_none(pte))
>  			continue;
>  
>  		WARN_ON(!pte_present(pte));
>  		__pte_clear(&init_mm, addr, ptep);
> +		dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>  		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>  		if (free_mapped)
>  			free_hotplug_page_range(pte_page(pte),
>  						PAGE_SIZE, altmap);
> -	} while (addr += PAGE_SIZE, addr < end);
> +	} while (ptep++, addr += PAGE_SIZE, addr < end);
> +}
> +
> +static void unmap_hotplug_cont_pte_range(pmd_t *pmdp, unsigned long addr,
> +					 unsigned long end, bool free_mapped,
> +					 struct vmem_altmap *altmap)
> +{
> +	unsigned long next;
> +	pte_t *ptep, pte;
> +
> +	do {
> +		next = pte_cont_addr_end(addr, end);
> +		ptep = pte_offset_kernel(pmdp, addr);
> +		pte = __ptep_get(ptep);
> +
> +		if (pte_present(pte) && pte_cont(pte)) {
> +			dm_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
> +			dm_meminfo_add(addr, CONT_PTE_SIZE, PTE);
> +		}
> +
> +		unmap_hotplug_pte_range(ptep, addr, next, free_mapped, altmap);
> +	} while (addr = next, addr < end);
>  }
>  
> -static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
> +static void unmap_hotplug_pmd_range(pmd_t *pmdp, unsigned long addr,
>  				    unsigned long end, bool free_mapped,
>  				    struct vmem_altmap *altmap)
>  {
>  	unsigned long next;
> -	pmd_t *pmdp, pmd;
> +	pmd_t pmd;
>  
>  	do {
>  		next = pmd_addr_end(addr, end);
> -		pmdp = pmd_offset(pudp, addr);
>  		pmd = READ_ONCE(*pmdp);
>  		if (pmd_none(pmd))
>  			continue;
> @@ -1482,6 +1619,7 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>  		WARN_ON(!pmd_present(pmd));
>  		if (pmd_sect(pmd)) {
>  			pmd_clear(pmdp);
> +			dm_meminfo_sub(addr, PMD_SIZE, PMD);
>  
>  			/*
>  			 * One TLBI should be sufficient here as the PMD_SIZE
> @@ -1494,7 +1632,28 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>  			continue;
>  		}
>  		WARN_ON(!pmd_table(pmd));
> -		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
> +		unmap_hotplug_cont_pte_range(pmdp, addr, next, free_mapped, altmap);
> +	} while (pmdp++, addr = next, addr < end);
> +}
> +
> +static void unmap_hotplug_cont_pmd_range(pud_t *pudp, unsigned long addr,
> +					 unsigned long end, bool free_mapped,
> +					 struct vmem_altmap *altmap)
> +{
> +	unsigned long next;
> +	pmd_t *pmdp, pmd;
> +
> +	do {
> +		next = pmd_cont_addr_end(addr, end);
> +		pmdp = pmd_offset(pudp, addr);
> +		pmd = READ_ONCE(*pmdp);
> +
> +		if (pmd_leaf(pmd) && pmd_cont(pmd)) {
> +			dm_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
> +			dm_meminfo_add(addr, CONT_PMD_SIZE, PMD);
> +		}
> +
> +		unmap_hotplug_pmd_range(pmdp, addr, next, free_mapped, altmap);
>  	} while (addr = next, addr < end);
>  }
>  
> @@ -1515,6 +1674,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
>  		WARN_ON(!pud_present(pud));
>  		if (pud_sect(pud)) {
>  			pud_clear(pudp);
> +			dm_meminfo_sub(addr, PUD_SIZE, PUD);
>  
>  			/*
>  			 * One TLBI should be sufficient here as the PUD_SIZE
> @@ -1527,7 +1687,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
>  			continue;
>  		}
>  		WARN_ON(!pud_table(pud));
> -		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
> +		unmap_hotplug_cont_pmd_range(pudp, addr, next, free_mapped, altmap);
>  	} while (addr = next, addr < end);
>  }
>
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 2 weeks, 1 day ago

On 1/21/26 9:09 PM, Anshuman Khandual wrote:
> Hello Yang,
>
> On 07/01/26 5:59 AM, Yang Shi wrote:
>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>> rodata=full"), the direct mapping may be split on some machines instead
>> keeping static since boot. It makes more sense to show the direct mapping
>> use in /proc/meminfo than before.
> I guess the direct mapping here refers to linear map ? IIUC it is called
> direct map on x86 and linear map on arm64 platforms. Then should not it
> be renamed as s/DirectMap/LinearMap instead ? This will align with names
> from ptdump as well.

Yes, linear map refers to direct map. They are interchangeable in this
patch. Using "DirectMap" keeps compatibility with x86.

>
> Before the above mentioned commit, linear could get altered with memory
> hotplug and remove events as well.
>
>> This patch will make /proc/meminfo show the direct mapping use like the
>> below (4K base page size):
>> DirectMap4K:	   94792 kB
>> DirectMap64K:	  134208 kB
>> DirectMap2M:	 1173504 kB
>> DirectMap32M:	 5636096 kB
>> DirectMap1G:	529530880 kB
> If /proc/meminfo interface is getting updated via  arch_report_meminfo()
> why not add stats for all kernel virtual address space ranges including
> vmemmap, vmalloc etc aka all address range headers in ptdump as many of
> those could change during system runtime. What makes linear mapping any
> special ?

Other than what Will suggested, /proc/meminfo does show vmalloc info:

VmallocTotal:   135288315904 kB
VmallocUsed:      114200 kB
VmallocChunk:          0 kB

AFAICT, large block mappings for vmalloc have not been widely used on
arm64 yet.
"cat /sys/kernel/debug/kernel_page_tables | grep -e BLK" doesn't show
any large block mapping on my AmpereOne machine with a v6.19-rc4 kernel.
If we get more large block mappings for vmalloc, we can add the mapping
size info in the future.

Thanks,
Yang

>
>> Although just the machines which support BBML2_NOABORT can split the
>> direct mapping, show it on all machines regardless of BBML2_NOABORT so
>> that the users have consistent view in order to avoid confusion.
>>
>> Although ptdump also can tell the direct map use, but it needs to dump
>> the whole kernel page table. It is costly and overkilling. It is also
>> in debugfs which may not be enabled by all distros. So showing direct
>> map use in /proc/meminfo seems more convenient and has less overhead.
> Agreed a /proc/meminfo based broader kernel virtual address space stats
> display will complement ptdump which provides more granular information
> about their mapping (with additional cost and setup) but it should cover
> all the regions in kernel virtual space.
>
>> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
>> ---
>> v5: * Rebased to v6.19-rc4
>>      * Fixed the build error for !CONFIG_PROC_FS
>> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>>      * Used shorter name for the helpers and variables per Ryan
>>      * Fixed accounting for memory hotunplug
>> v3: * Fixed the over-accounting problems per Ryan
>>      * Introduced helpers for add/sub direct map use and #ifdef them with
>>        CONFIG_PROC_FS per Ryan
>>      * v3 is a fix patch on top of v2
>> v2: * Counted in size instead of the number of entries per Ryan
>>      * Removed shift array per Ryan
>>      * Use lower case "k" per Ryan
>>      * Fixed a couple of build warnings reported by kernel test robot
>>      * Fixed a couple of poential miscounts
>>
>>   arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++++++-----
>>   1 file changed, 181 insertions(+), 21 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 8e1d80a7033e..422441c9a992 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -29,6 +29,7 @@
>>   #include <linux/mm_inline.h>
>>   #include <linux/pagewalk.h>
>>   #include <linux/stop_machine.h>
>> +#include <linux/proc_fs.h>
>>   
>>   #include <asm/barrier.h>
>>   #include <asm/cputype.h>
>> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>>   	dsb(ishst);
>>   }
>>   
>> +enum dm_type {
>> +	PTE,
>> +	CONT_PTE,
>> +	PMD,
>> +	CONT_PMD,
>> +	PUD,
>> +	NR_DM_TYPE,
>> +};
>> +
>> +#ifdef CONFIG_PROC_FS
>> +static unsigned long dm_meminfo[NR_DM_TYPE];
>> +
>> +void arch_report_meminfo(struct seq_file *m)
>> +{
>> +	char *size[NR_DM_TYPE];
>> +
>> +#if defined(CONFIG_ARM64_4K_PAGES)
>> +	size[PTE] = "4k";
>> +	size[CONT_PTE] = "64k";
>> +	size[PMD] = "2M";
>> +	size[CONT_PMD] = "32M";
>> +	size[PUD] = "1G";
>> +#elif defined(CONFIG_ARM64_16K_PAGES)
>> +	size[PTE] = "16k";
>> +	size[CONT_PTE] = "2M";
>> +	size[PMD] = "32M";
>> +	size[CONT_PMD] = "1G";
>> +#elif defined(CONFIG_ARM64_64K_PAGES)
>> +	size[PTE] = "64k";
>> +	size[CONT_PTE] = "2M";
>> +	size[PMD] = "512M";
>> +	size[CONT_PMD] = "16G";
>> +#endif
>> +
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PTE], dm_meminfo[PTE] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[CONT_PTE],
>> +			dm_meminfo[CONT_PTE] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PMD], dm_meminfo[PMD] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[CONT_PMD],
>> +			dm_meminfo[CONT_PMD] >> 10);
>> +	if (pud_sect_supported())
>> +		seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PUD], dm_meminfo[PUD] >> 10);
>> +}
>> +
>> +static inline bool is_dm_addr(unsigned long addr)
>> +{
>> +	return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
>> +}
>> +
>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +	if (is_dm_addr(addr))
>> +		dm_meminfo[type] += size;
>> +}
>> +
>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +	if (is_dm_addr(addr))
>> +		dm_meminfo[type] -= size;
>> +}
>> +#else
>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +}
>> +
>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +}
>> +#endif
>> +
>>   static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>>   		     phys_addr_t phys, pgprot_t prot)
>>   {
>> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>   
>>   		init_pte(ptep, addr, next, phys, __prot);
>>   
>> +		if (pgprot_val(__prot) & PTE_CONT)
>> +			dm_meminfo_add(addr, (next - addr), CONT_PTE);
>> +		else
>> +			dm_meminfo_add(addr, (next - addr), PTE);
>> +
>>   		ptep += pte_index(next) - pte_index(addr);
>>   		phys += next - addr;
>>   	} while (addr = next, addr != end);
>> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>>   		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>   			pmd_set_huge(pmdp, phys, prot);
>>   
>> +			/*
>> +			 * It is possible to have mappings allow cont mapping
>> +			 * but disallow block mapping. For example,
>> +			 * map_entry_trampoline().
>> +			 * So we have to increase CONT_PMD and PMD size here
>> +			 * to avoid double counting.
>> +			 */
>> +			if (pgprot_val(prot) & PTE_CONT)
>> +				dm_meminfo_add(addr, (next - addr), CONT_PMD);
>> +			else
>> +				dm_meminfo_add(addr, (next - addr), PMD);
>>   			/*
>>   			 * After the PMD entry has been populated once, we
>>   			 * only allow updates to the permission attributes.
>> @@ -389,6 +485,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>>   		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>   			pud_set_huge(pudp, phys, prot);
>>   
>> +			dm_meminfo_add(addr, (next - addr), PUD);
>>   			/*
>>   			 * After the PUD entry has been populated once, we
>>   			 * only allow updates to the permission attributes.
>> @@ -575,16 +672,21 @@ pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
>>   	return  __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
>>   }
>>   
>> -static void split_contpte(pte_t *ptep)
>> +static void split_contpte(unsigned long addr, pte_t *ptep)
>>   {
>>   	int i;
>>   
>> +	dm_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
>> +
>>   	ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
>>   	for (i = 0; i < CONT_PTES; i++, ptep++)
>>   		__set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
>> +
>> +	dm_meminfo_add(addr, CONT_PTE_SIZE, PTE);
>>   }
>>   
>> -static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>> +static int split_pmd(unsigned long addr, pmd_t *pmdp, pmd_t pmd, gfp_t gfp,
>> +		     bool to_cont)
>>   {
>>   	pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
>>   	unsigned long pfn = pmd_pfn(pmd);
>> @@ -606,8 +708,13 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>>   	if (to_cont)
>>   		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>   
>> +	dm_meminfo_sub(addr, PMD_SIZE, PMD);
>>   	for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
>>   		__set_pte(ptep, pfn_pte(pfn, prot));
>> +	if (to_cont)
>> +		dm_meminfo_add(addr, PMD_SIZE, CONT_PTE);
>> +	else
>> +		dm_meminfo_add(addr, PMD_SIZE, PTE);
>>   
>>   	/*
>>   	 * Ensure the pte entries are visible to the table walker by the time
>> @@ -619,16 +726,21 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>>   	return 0;
>>   }
>>   
>> -static void split_contpmd(pmd_t *pmdp)
>> +static void split_contpmd(unsigned long addr, pmd_t *pmdp)
>>   {
>>   	int i;
>>   
>> +	dm_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
>> +
>>   	pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
>>   	for (i = 0; i < CONT_PMDS; i++, pmdp++)
>>   		set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
>> +
>> +	dm_meminfo_add(addr, CONT_PMD_SIZE, PMD);
>>   }
>>   
>> -static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
>> +static int split_pud(unsigned long addr, pud_t *pudp, pud_t pud, gfp_t gfp,
>> +		     bool to_cont)
>>   {
>>   	pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
>>   	unsigned int step = PMD_SIZE >> PAGE_SHIFT;
>> @@ -651,8 +763,13 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
>>   	if (to_cont)
>>   		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>   
>> +	dm_meminfo_sub(addr, PUD_SIZE, PUD);
>>   	for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step)
>>   		set_pmd(pmdp, pfn_pmd(pfn, prot));
>> +	if (to_cont)
>> +		dm_meminfo_add(addr, PUD_SIZE, CONT_PMD);
>> +	else
>> +		dm_meminfo_add(addr, PUD_SIZE, PMD);
>>   
>>   	/*
>>   	 * Ensure the pmd entries are visible to the table walker by the time
>> @@ -707,7 +824,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>   	if (!pud_present(pud))
>>   		goto out;
>>   	if (pud_leaf(pud)) {
>> -		ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
>> +		ret = split_pud(addr, pudp, pud, GFP_PGTABLE_KERNEL, true);
>>   		if (ret)
>>   			goto out;
>>   	}
>> @@ -725,14 +842,14 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>   		goto out;
>>   	if (pmd_leaf(pmd)) {
>>   		if (pmd_cont(pmd))
>> -			split_contpmd(pmdp);
>> +			split_contpmd(addr, pmdp);
>>   		/*
>>   		 * PMD: If addr is PMD aligned then addr already describes a
>>   		 * leaf boundary. Otherwise, split to contpte.
>>   		 */
>>   		if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
>>   			goto out;
>> -		ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
>> +		ret = split_pmd(addr, pmdp, pmd, GFP_PGTABLE_KERNEL, true);
>>   		if (ret)
>>   			goto out;
>>   	}
>> @@ -749,7 +866,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>   	if (!pte_present(pte))
>>   		goto out;
>>   	if (pte_cont(pte))
>> -		split_contpte(ptep);
>> +		split_contpte(addr, ptep);
>>   
>>   out:
>>   	return ret;
>> @@ -835,7 +952,7 @@ static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
>>   	int ret = 0;
>>   
>>   	if (pud_leaf(pud))
>> -		ret = split_pud(pudp, pud, gfp, false);
>> +		ret = split_pud(addr, pudp, pud, gfp, false);
>>   
>>   	return ret;
>>   }
>> @@ -849,8 +966,8 @@ static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
>>   
>>   	if (pmd_leaf(pmd)) {
>>   		if (pmd_cont(pmd))
>> -			split_contpmd(pmdp);
>> -		ret = split_pmd(pmdp, pmd, gfp, false);
>> +			split_contpmd(addr, pmdp);
>> +		ret = split_pmd(addr, pmdp, pmd, gfp, false);
>>   
>>   		/*
>>   		 * We have split the pmd directly to ptes so there is no need to
>> @@ -868,7 +985,7 @@ static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
>>   	pte_t pte = __ptep_get(ptep);
>>   
>>   	if (pte_cont(pte))
>> -		split_contpte(ptep);
>> +		split_contpte(addr, ptep);
>>   
>>   	return 0;
>>   }
>> @@ -1444,37 +1561,57 @@ static bool pgtable_range_aligned(unsigned long start, unsigned long end,
>>   	return true;
>>   }
>>   
>> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>>   				    unsigned long end, bool free_mapped,
>>   				    struct vmem_altmap *altmap)
>>   {
>> -	pte_t *ptep, pte;
>> +	pte_t pte;
>>   
>>   	do {
>> -		ptep = pte_offset_kernel(pmdp, addr);
>>   		pte = __ptep_get(ptep);
>>   		if (pte_none(pte))
>>   			continue;
>>   
>>   		WARN_ON(!pte_present(pte));
>>   		__pte_clear(&init_mm, addr, ptep);
>> +		dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>>   		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>   		if (free_mapped)
>>   			free_hotplug_page_range(pte_page(pte),
>>   						PAGE_SIZE, altmap);
>> -	} while (addr += PAGE_SIZE, addr < end);
>> +	} while (ptep++, addr += PAGE_SIZE, addr < end);
>> +}
>> +
>> +static void unmap_hotplug_cont_pte_range(pmd_t *pmdp, unsigned long addr,
>> +					 unsigned long end, bool free_mapped,
>> +					 struct vmem_altmap *altmap)
>> +{
>> +	unsigned long next;
>> +	pte_t *ptep, pte;
>> +
>> +	do {
>> +		next = pte_cont_addr_end(addr, end);
>> +		ptep = pte_offset_kernel(pmdp, addr);
>> +		pte = __ptep_get(ptep);
>> +
>> +		if (pte_present(pte) && pte_cont(pte)) {
>> +			dm_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
>> +			dm_meminfo_add(addr, CONT_PTE_SIZE, PTE);
>> +		}
>> +
>> +		unmap_hotplug_pte_range(ptep, addr, next, free_mapped, altmap);
>> +	} while (addr = next, addr < end);
>>   }
>>   
>> -static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>> +static void unmap_hotplug_pmd_range(pmd_t *pmdp, unsigned long addr,
>>   				    unsigned long end, bool free_mapped,
>>   				    struct vmem_altmap *altmap)
>>   {
>>   	unsigned long next;
>> -	pmd_t *pmdp, pmd;
>> +	pmd_t pmd;
>>   
>>   	do {
>>   		next = pmd_addr_end(addr, end);
>> -		pmdp = pmd_offset(pudp, addr);
>>   		pmd = READ_ONCE(*pmdp);
>>   		if (pmd_none(pmd))
>>   			continue;
>> @@ -1482,6 +1619,7 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>>   		WARN_ON(!pmd_present(pmd));
>>   		if (pmd_sect(pmd)) {
>>   			pmd_clear(pmdp);
>> +			dm_meminfo_sub(addr, PMD_SIZE, PMD);
>>   
>>   			/*
>>   			 * One TLBI should be sufficient here as the PMD_SIZE
>> @@ -1494,7 +1632,28 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>>   			continue;
>>   		}
>>   		WARN_ON(!pmd_table(pmd));
>> -		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
>> +		unmap_hotplug_cont_pte_range(pmdp, addr, next, free_mapped, altmap);
>> +	} while (pmdp++, addr = next, addr < end);
>> +}
>> +
>> +static void unmap_hotplug_cont_pmd_range(pud_t *pudp, unsigned long addr,
>> +					 unsigned long end, bool free_mapped,
>> +					 struct vmem_altmap *altmap)
>> +{
>> +	unsigned long next;
>> +	pmd_t *pmdp, pmd;
>> +
>> +	do {
>> +		next = pmd_cont_addr_end(addr, end);
>> +		pmdp = pmd_offset(pudp, addr);
>> +		pmd = READ_ONCE(*pmdp);
>> +
>> +		if (pmd_leaf(pmd) && pmd_cont(pmd)) {
>> +			dm_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
>> +			dm_meminfo_add(addr, CONT_PMD_SIZE, PMD);
>> +		}
>> +
>> +		unmap_hotplug_pmd_range(pmdp, addr, next, free_mapped, altmap);
>>   	} while (addr = next, addr < end);
>>   }
>>   
>> @@ -1515,6 +1674,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
>>   		WARN_ON(!pud_present(pud));
>>   		if (pud_sect(pud)) {
>>   			pud_clear(pudp);
>> +			dm_meminfo_sub(addr, PUD_SIZE, PUD);
>>   
>>   			/*
>>   			 * One TLBI should be sufficient here as the PUD_SIZE
>> @@ -1527,7 +1687,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
>>   			continue;
>>   		}
>>   		WARN_ON(!pud_table(pud));
>> -		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
>> +		unmap_hotplug_cont_pmd_range(pudp, addr, next, free_mapped, altmap);
>>   	} while (addr = next, addr < end);
>>   }
>>   

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 2 weeks ago
Oops, it looks like a similar response was sent twice. My email server
had some hiccups yesterday; I thought the first one had failed to send,
so I sent another one. But it looks like both went through.

Anyway, both are basically the same. Sorry for the confusion.

Thanks,
Yang


On 1/22/26 1:41 PM, Yang Shi wrote:
>
>
> On 1/21/26 9:09 PM, Anshuman Khandual wrote:
>> Hello Yang,
>>
>> On 07/01/26 5:59 AM, Yang Shi wrote:
>>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>>> rodata=full"), the direct mapping may be split on some machines instead
>>> keeping static since boot. It makes more sense to show the direct 
>>> mapping
>>> use in /proc/meminfo than before.
>> I guess the direct mapping here refers to linear map ? IIUC it is called
>> direct map on x86 and linear map on arm64 platforms. Then should not it
>> be renamed as s/DirectMap/LinearMap instead ? This will align with names
>> from ptdump as well.
>
> Yes, linear map refers to direct map. They are interchangeable in this 
> patch. Using "DirectMap" keeps the compatibility with x86.
>
>>
>> Before the above mentioned commit, linear could get altered with memory
>> hotplug and remove events as well.
>>
>>> This patch will make /proc/meminfo show the direct mapping use like the
>>> below (4K base page size):
>>> DirectMap4K:       94792 kB
>>> DirectMap64K:      134208 kB
>>> DirectMap2M:     1173504 kB
>>> DirectMap32M:     5636096 kB
>>> DirectMap1G:    529530880 kB
>> If /proc/meminfo interface is getting updated via arch_report_meminfo()
>> why not add stats for all kernel virtual address space ranges including
>> vmemmap, vmalloc etc aka all address range headers in ptdump as many of
>> those could change during system runtime. What makes linear mapping any
>> special ?
>
> Other than what Will suggested, /proc/meminfo does show vmalloc info:
>
> VmallocTotal:   135288315904 kB
> VmallocUsed:      114200 kB
> VmallocChunk:          0 kB
>
> AFAICT, large block mapping for vmalloc has not been widely used by 
> arm64 yet.
> "cat /sys/kernel/debug/kernel_page_tables | grep -e BLK" doesn't show 
> any large block mapping on my AmpereOne machine with v6.19-rc4 kernel. 
> If we get more large block mappings for vmalloc, we can add the 
> mapping size info in the future.
>
> Thanks,
> Yang
>
>>
>>> Although just the machines which support BBML2_NOABORT can split the
>>> direct mapping, show it on all machines regardless of BBML2_NOABORT so
>>> that the users have consistent view in order to avoid confusion.
>>>
>>> Although ptdump also can tell the direct map use, but it needs to dump
>>> the whole kernel page table. It is costly and overkilling. It is also
>>> in debugfs which may not be enabled by all distros. So showing direct
>>> map use in /proc/meminfo seems more convenient and has less overhead.
>> Agreed a /proc/meminfo based broader kernel virtual address space stats
>> display will complement ptdump which provides more granular information
>> about their mapping (with additional cost and setup) but it should cover
>> all the regions in kernel virtual space.
>>
>>> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
>>> ---
>>> v5: * Rebased to v6.19-rc4
>>>      * Fixed the build error for !CONFIG_PROC_FS
>>> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>>>      * Used shorter name for the helpers and variables per Ryan
>>>      * Fixed accounting for memory hotunplug
>>> v3: * Fixed the over-accounting problems per Ryan
>>>      * Introduced helpers for add/sub direct map use and #ifdef them 
>>> with
>>>        CONFIG_PROC_FS per Ryan
>>>      * v3 is a fix patch on top of v2
>>> v2: * Counted in size instead of the number of entries per Ryan
>>>      * Removed shift array per Ryan
>>>      * Use lower case "k" per Ryan
>>>      * Fixed a couple of build warnings reported by kernel test robot
>>>      * Fixed a couple of poential miscounts
>>>
>>>   arch/arm64/mm/mmu.c | 202 
>>> +++++++++++++++++++++++++++++++++++++++-----
>>>   1 file changed, 181 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>>> index 8e1d80a7033e..422441c9a992 100644
>>> --- a/arch/arm64/mm/mmu.c
>>> +++ b/arch/arm64/mm/mmu.c
>>> @@ -29,6 +29,7 @@
>>>   #include <linux/mm_inline.h>
>>>   #include <linux/pagewalk.h>
>>>   #include <linux/stop_machine.h>
>>> +#include <linux/proc_fs.h>
>>>     #include <asm/barrier.h>
>>>   #include <asm/cputype.h>
>>> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>>>       dsb(ishst);
>>>   }
>>>   +enum dm_type {
>>> +    PTE,
>>> +    CONT_PTE,
>>> +    PMD,
>>> +    CONT_PMD,
>>> +    PUD,
>>> +    NR_DM_TYPE,
>>> +};
>>> +
>>> +#ifdef CONFIG_PROC_FS
>>> +static unsigned long dm_meminfo[NR_DM_TYPE];
>>> +
>>> +void arch_report_meminfo(struct seq_file *m)
>>> +{
>>> +    char *size[NR_DM_TYPE];
>>> +
>>> +#if defined(CONFIG_ARM64_4K_PAGES)
>>> +    size[PTE] = "4k";
>>> +    size[CONT_PTE] = "64k";
>>> +    size[PMD] = "2M";
>>> +    size[CONT_PMD] = "32M";
>>> +    size[PUD] = "1G";
>>> +#elif defined(CONFIG_ARM64_16K_PAGES)
>>> +    size[PTE] = "16k";
>>> +    size[CONT_PTE] = "2M";
>>> +    size[PMD] = "32M";
>>> +    size[CONT_PMD] = "1G";
>>> +#elif defined(CONFIG_ARM64_64K_PAGES)
>>> +    size[PTE] = "64k";
>>> +    size[CONT_PTE] = "2M";
>>> +    size[PMD] = "512M";
>>> +    size[CONT_PMD] = "16G";
>>> +#endif
>>> +
>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>> +            size[PTE], dm_meminfo[PTE] >> 10);
>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>> +            size[CONT_PTE],
>>> +            dm_meminfo[CONT_PTE] >> 10);
>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>> +            size[PMD], dm_meminfo[PMD] >> 10);
>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>> +            size[CONT_PMD],
>>> +            dm_meminfo[CONT_PMD] >> 10);
>>> +    if (pud_sect_supported())
>>> +        seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>> +            size[PUD], dm_meminfo[PUD] >> 10);
>>> +}
>>> +
>>> +static inline bool is_dm_addr(unsigned long addr)
>>> +{
>>> +    return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
>>> +}
>>> +
>>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long 
>>> size,
>>> +                  enum dm_type type)
>>> +{
>>> +    if (is_dm_addr(addr))
>>> +        dm_meminfo[type] += size;
>>> +}
>>> +
>>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long 
>>> size,
>>> +                  enum dm_type type)
>>> +{
>>> +    if (is_dm_addr(addr))
>>> +        dm_meminfo[type] -= size;
>>> +}
>>> +#else
>>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long 
>>> size,
>>> +                  enum dm_type type)
>>> +{
>>> +}
>>> +
>>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long 
>>> size,
>>> +                  enum dm_type type)
>>> +{
>>> +}
>>> +#endif
>>> +
>>>   static void init_pte(pte_t *ptep, unsigned long addr, unsigned 
>>> long end,
>>>                phys_addr_t phys, pgprot_t prot)
>>>   {
>>> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>>  
>>>  		init_pte(ptep, addr, next, phys, __prot);
>>>  
>>> +		if (pgprot_val(__prot) & PTE_CONT)
>>> +			dm_meminfo_add(addr, (next - addr), CONT_PTE);
>>> +		else
>>> +			dm_meminfo_add(addr, (next - addr), PTE);
>>> +
>>>  		ptep += pte_index(next) - pte_index(addr);
>>>  		phys += next - addr;
>>>  	} while (addr = next, addr != end);
>>> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>>>  		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>>  			pmd_set_huge(pmdp, phys, prot);
>>>  
>>> +			/*
>>> +			 * It is possible to have mappings allow cont mapping
>>> +			 * but disallow block mapping. For example,
>>> +			 * map_entry_trampoline().
>>> +			 * So we have to increase CONT_PMD and PMD size here
>>> +			 * to avoid double counting.
>>> +			 */
>>> +			if (pgprot_val(prot) & PTE_CONT)
>>> +				dm_meminfo_add(addr, (next - addr), CONT_PMD);
>>> +			else
>>> +				dm_meminfo_add(addr, (next - addr), PMD);
>>>  			/*
>>>  			 * After the PMD entry has been populated once, we
>>>  			 * only allow updates to the permission attributes.
>>> @@ -389,6 +485,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>>>  		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>>  			pud_set_huge(pudp, phys, prot);
>>>  
>>> +			dm_meminfo_add(addr, (next - addr), PUD);
>>>  			/*
>>>  			 * After the PUD entry has been populated once, we
>>>  			 * only allow updates to the permission attributes.
>>> @@ -575,16 +672,21 @@ pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
>>>  	return  __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
>>>  }
>>>  
>>> -static void split_contpte(pte_t *ptep)
>>> +static void split_contpte(unsigned long addr, pte_t *ptep)
>>>  {
>>>  	int i;
>>>  
>>> +	dm_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
>>> +
>>>  	ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
>>>  	for (i = 0; i < CONT_PTES; i++, ptep++)
>>>  		__set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
>>> +
>>> +	dm_meminfo_add(addr, CONT_PTE_SIZE, PTE);
>>>  }
>>>  
>>> -static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>>> +static int split_pmd(unsigned long addr, pmd_t *pmdp, pmd_t pmd, gfp_t gfp,
>>> +		     bool to_cont)
>>>  {
>>>  	pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
>>>  	unsigned long pfn = pmd_pfn(pmd);
>>> @@ -606,8 +708,13 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>>>  	if (to_cont)
>>>  		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>>  
>>> +	dm_meminfo_sub(addr, PMD_SIZE, PMD);
>>>  	for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
>>>  		__set_pte(ptep, pfn_pte(pfn, prot));
>>> +	if (to_cont)
>>> +		dm_meminfo_add(addr, PMD_SIZE, CONT_PTE);
>>> +	else
>>> +		dm_meminfo_add(addr, PMD_SIZE, PTE);
>>>  
>>>  	/*
>>>  	 * Ensure the pte entries are visible to the table walker by the time
>>> @@ -619,16 +726,21 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>>>  	return 0;
>>>  }
>>>  
>>> -static void split_contpmd(pmd_t *pmdp)
>>> +static void split_contpmd(unsigned long addr, pmd_t *pmdp)
>>>  {
>>>  	int i;
>>>  
>>> +	dm_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
>>> +
>>>  	pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
>>>  	for (i = 0; i < CONT_PMDS; i++, pmdp++)
>>>  		set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
>>> +
>>> +	dm_meminfo_add(addr, CONT_PMD_SIZE, PMD);
>>>  }
>>>  
>>> -static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
>>> +static int split_pud(unsigned long addr, pud_t *pudp, pud_t pud, gfp_t gfp,
>>> +		     bool to_cont)
>>>  {
>>>  	pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
>>>  	unsigned int step = PMD_SIZE >> PAGE_SHIFT;
>>> @@ -651,8 +763,13 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
>>>  	if (to_cont)
>>>  		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>>  
>>> +	dm_meminfo_sub(addr, PUD_SIZE, PUD);
>>>  	for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step)
>>>  		set_pmd(pmdp, pfn_pmd(pfn, prot));
>>> +	if (to_cont)
>>> +		dm_meminfo_add(addr, PUD_SIZE, CONT_PMD);
>>> +	else
>>> +		dm_meminfo_add(addr, PUD_SIZE, PMD);
>>>  
>>>  	/*
>>>  	 * Ensure the pmd entries are visible to the table walker by the time
>>> @@ -707,7 +824,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>>  	if (!pud_present(pud))
>>>  		goto out;
>>>  	if (pud_leaf(pud)) {
>>> -		ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
>>> +		ret = split_pud(addr, pudp, pud, GFP_PGTABLE_KERNEL, true);
>>>  		if (ret)
>>>  			goto out;
>>>  	}
>>> @@ -725,14 +842,14 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>>  		goto out;
>>>  	if (pmd_leaf(pmd)) {
>>>  		if (pmd_cont(pmd))
>>> -			split_contpmd(pmdp);
>>> +			split_contpmd(addr, pmdp);
>>>  		/*
>>>  		 * PMD: If addr is PMD aligned then addr already describes a
>>>  		 * leaf boundary. Otherwise, split to contpte.
>>>  		 */
>>>  		if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
>>>  			goto out;
>>> -		ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
>>> +		ret = split_pmd(addr, pmdp, pmd, GFP_PGTABLE_KERNEL, true);
>>>  		if (ret)
>>>  			goto out;
>>>  	}
>>> @@ -749,7 +866,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>>  	if (!pte_present(pte))
>>>  		goto out;
>>>  	if (pte_cont(pte))
>>> -		split_contpte(ptep);
>>> +		split_contpte(addr, ptep);
>>>  
>>>  out:
>>>  	return ret;
>>> @@ -835,7 +952,7 @@ static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
>>>  	int ret = 0;
>>>  
>>>  	if (pud_leaf(pud))
>>> -		ret = split_pud(pudp, pud, gfp, false);
>>> +		ret = split_pud(addr, pudp, pud, gfp, false);
>>>  
>>>  	return ret;
>>>  }
>>> @@ -849,8 +966,8 @@ static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
>>>  
>>>  	if (pmd_leaf(pmd)) {
>>>  		if (pmd_cont(pmd))
>>> -			split_contpmd(pmdp);
>>> -		ret = split_pmd(pmdp, pmd, gfp, false);
>>> +			split_contpmd(addr, pmdp);
>>> +		ret = split_pmd(addr, pmdp, pmd, gfp, false);
>>>  
>>>  		/*
>>>  		 * We have split the pmd directly to ptes so there is no need to
>>> @@ -868,7 +985,7 @@ static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
>>>  	pte_t pte = __ptep_get(ptep);
>>>  
>>>  	if (pte_cont(pte))
>>> -		split_contpte(ptep);
>>> +		split_contpte(addr, ptep);
>>>  
>>>  	return 0;
>>>  }
>>> @@ -1444,37 +1561,57 @@ static bool pgtable_range_aligned(unsigned long start, unsigned long end,
>>>  	return true;
>>>  }
>>>  
>>> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>>> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>>>  				    unsigned long end, bool free_mapped,
>>>  				    struct vmem_altmap *altmap)
>>>  {
>>> -	pte_t *ptep, pte;
>>> +	pte_t pte;
>>>  
>>>  	do {
>>> -		ptep = pte_offset_kernel(pmdp, addr);
>>>  		pte = __ptep_get(ptep);
>>>  		if (pte_none(pte))
>>>  			continue;
>>>  
>>>  		WARN_ON(!pte_present(pte));
>>>  		__pte_clear(&init_mm, addr, ptep);
>>> +		dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>>>  		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>>  		if (free_mapped)
>>>  			free_hotplug_page_range(pte_page(pte),
>>>  						PAGE_SIZE, altmap);
>>> -	} while (addr += PAGE_SIZE, addr < end);
>>> +	} while (ptep++, addr += PAGE_SIZE, addr < end);
>>> +}
>>> +
>>> +static void unmap_hotplug_cont_pte_range(pmd_t *pmdp, unsigned long addr,
>>> +					 unsigned long end, bool free_mapped,
>>> +					 struct vmem_altmap *altmap)
>>> +{
>>> +	unsigned long next;
>>> +	pte_t *ptep, pte;
>>> +
>>> +	do {
>>> +		next = pte_cont_addr_end(addr, end);
>>> +		ptep = pte_offset_kernel(pmdp, addr);
>>> +		pte = __ptep_get(ptep);
>>> +
>>> +		if (pte_present(pte) && pte_cont(pte)) {
>>> +			dm_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
>>> +			dm_meminfo_add(addr, CONT_PTE_SIZE, PTE);
>>> +		}
>>> +
>>> +		unmap_hotplug_pte_range(ptep, addr, next, free_mapped, altmap);
>>> +	} while (addr = next, addr < end);
>>>  }
>>>  
>>> -static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>>> +static void unmap_hotplug_pmd_range(pmd_t *pmdp, unsigned long addr,
>>>  				    unsigned long end, bool free_mapped,
>>>  				    struct vmem_altmap *altmap)
>>>  {
>>>  	unsigned long next;
>>> -	pmd_t *pmdp, pmd;
>>> +	pmd_t pmd;
>>>  
>>>  	do {
>>>  		next = pmd_addr_end(addr, end);
>>> -		pmdp = pmd_offset(pudp, addr);
>>>  		pmd = READ_ONCE(*pmdp);
>>>  		if (pmd_none(pmd))
>>>  			continue;
>>> @@ -1482,6 +1619,7 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>>>  		WARN_ON(!pmd_present(pmd));
>>>  		if (pmd_sect(pmd)) {
>>>  			pmd_clear(pmdp);
>>> +			dm_meminfo_sub(addr, PMD_SIZE, PMD);
>>>  
>>>  			/*
>>>  			 * One TLBI should be sufficient here as the PMD_SIZE
>>> @@ -1494,7 +1632,28 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>>>  			continue;
>>>  		}
>>>  		WARN_ON(!pmd_table(pmd));
>>> -		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
>>> +		unmap_hotplug_cont_pte_range(pmdp, addr, next, free_mapped, altmap);
>>> +	} while (pmdp++, addr = next, addr < end);
>>> +}
>>> +
>>> +static void unmap_hotplug_cont_pmd_range(pud_t *pudp, unsigned long addr,
>>> +					 unsigned long end, bool free_mapped,
>>> +					 struct vmem_altmap *altmap)
>>> +{
>>> +	unsigned long next;
>>> +	pmd_t *pmdp, pmd;
>>> +
>>> +	do {
>>> +		next = pmd_cont_addr_end(addr, end);
>>> +		pmdp = pmd_offset(pudp, addr);
>>> +		pmd = READ_ONCE(*pmdp);
>>> +
>>> +		if (pmd_leaf(pmd) && pmd_cont(pmd)) {
>>> +			dm_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
>>> +			dm_meminfo_add(addr, CONT_PMD_SIZE, PMD);
>>> +		}
>>> +
>>> +		unmap_hotplug_pmd_range(pmdp, addr, next, free_mapped, altmap);
>>>  	} while (addr = next, addr < end);
>>>  }
>>>  
>>> @@ -1515,6 +1674,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
>>>  		WARN_ON(!pud_present(pud));
>>>  		if (pud_sect(pud)) {
>>>  			pud_clear(pudp);
>>> +			dm_meminfo_sub(addr, PUD_SIZE, PUD);
>>>  
>>>  			/*
>>>  			 * One TLBI should be sufficient here as the PUD_SIZE
>>> @@ -1527,7 +1687,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
>>>  			continue;
>>>  		}
>>>  		WARN_ON(!pud_table(pud));
>>> -		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
>>> +		unmap_hotplug_cont_pmd_range(pudp, addr, next, free_mapped, altmap);
>>>  	} while (addr = next, addr < end);
>>>  }
>
>

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 2 weeks, 1 day ago

On 1/21/26 9:09 PM, Anshuman Khandual wrote:
> Hello Yang,
>
> On 07/01/26 5:59 AM, Yang Shi wrote:
>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>> rodata=full"), the direct mapping may be split on some machines instead
>> keeping static since boot. It makes more sense to show the direct mapping
>> use in /proc/meminfo than before.
> I guess the direct mapping here refers to linear map ? IIUC it is called
> direct map on x86 and linear map on arm64 platforms. Then should not it
> be renamed as s/DirectMap/LinearMap instead ? This will align with names
> from ptdump as well.
>
> Before the above mentioned commit, linear could get altered with memory
> hotplug and remove events as well.

Yes, direct mapping and linear mapping are used interchangeably in this
series. Using "DirectMap" in /proc/meminfo keeps compatibility with x86.

>
>> This patch will make /proc/meminfo show the direct mapping use like the
>> below (4K base page size):
>> DirectMap4K:	   94792 kB
>> DirectMap64K:	  134208 kB
>> DirectMap2M:	 1173504 kB
>> DirectMap32M:	 5636096 kB
>> DirectMap1G:	529530880 kB
> If /proc/meminfo interface is getting updated via  arch_report_meminfo()
> why not add stats for all kernel virtual address space ranges including
> vmemmap, vmalloc etc aka all address range headers in ptdump as many of
> those could change during system runtime. What makes linear mapping any
> special ?

/proc/meminfo does show vmalloc info:

VmallocTotal:   135288315904 kB
VmallocUsed:      115664 kB
VmallocChunk:          0 kB

And /proc/vmallocinfo gives much more details.

Or did you mean showing something like Vmalloc4K/Vmalloc2M? That may be
useful. But AFAICT, large vmalloc mappings are not widely used on arm64 yet.
"cat /sys/kernel/debug/kernel_page_tables | grep -e BLK" on my AmpereOne
machine (v6.19-rc4 kernel) doesn't show any large block mappings in the
vmalloc area except the area used by the kernel image.

And I agree with Will that it should not be part of this patch. We can add
it in the future if it turns out to be useful.

Thanks,
Yang

>
>> Although just the machines which support BBML2_NOABORT can split the
>> direct mapping, show it on all machines regardless of BBML2_NOABORT so
>> that the users have consistent view in order to avoid confusion.
>>
>> Although ptdump also can tell the direct map use, but it needs to dump
>> the whole kernel page table. It is costly and overkilling. It is also
>> in debugfs which may not be enabled by all distros. So showing direct
>> map use in /proc/meminfo seems more convenient and has less overhead.
> Agreed a /proc/meminfo based broader kernel virtual address space stats
> display will complement ptdump which provides more granular information
> about their mapping (with additional cost and setup) but it should cover
> all the regions in kernel virtual space.
>
>> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
>> ---
>> v5: * Rebased to v6.19-rc4
>>      * Fixed the build error for !CONFIG_PROC_FS
>> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>>      * Used shorter name for the helpers and variables per Ryan
>>      * Fixed accounting for memory hotunplug
>> v3: * Fixed the over-accounting problems per Ryan
>>      * Introduced helpers for add/sub direct map use and #ifdef them with
>>        CONFIG_PROC_FS per Ryan
>>      * v3 is a fix patch on top of v2
>> v2: * Counted in size instead of the number of entries per Ryan
>>      * Removed shift array per Ryan
>>      * Use lower case "k" per Ryan
>>      * Fixed a couple of build warnings reported by kernel test robot
>>      * Fixed a couple of poential miscounts
>>
>>   arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++++++-----
>>   1 file changed, 181 insertions(+), 21 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 8e1d80a7033e..422441c9a992 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -29,6 +29,7 @@
>>   #include <linux/mm_inline.h>
>>   #include <linux/pagewalk.h>
>>   #include <linux/stop_machine.h>
>> +#include <linux/proc_fs.h>
>>   
>>   #include <asm/barrier.h>
>>   #include <asm/cputype.h>
>> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>>   	dsb(ishst);
>>   }
>>   
>> +enum dm_type {
>> +	PTE,
>> +	CONT_PTE,
>> +	PMD,
>> +	CONT_PMD,
>> +	PUD,
>> +	NR_DM_TYPE,
>> +};
>> +
>> +#ifdef CONFIG_PROC_FS
>> +static unsigned long dm_meminfo[NR_DM_TYPE];
>> +
>> +void arch_report_meminfo(struct seq_file *m)
>> +{
>> +	char *size[NR_DM_TYPE];
>> +
>> +#if defined(CONFIG_ARM64_4K_PAGES)
>> +	size[PTE] = "4k";
>> +	size[CONT_PTE] = "64k";
>> +	size[PMD] = "2M";
>> +	size[CONT_PMD] = "32M";
>> +	size[PUD] = "1G";
>> +#elif defined(CONFIG_ARM64_16K_PAGES)
>> +	size[PTE] = "16k";
>> +	size[CONT_PTE] = "2M";
>> +	size[PMD] = "32M";
>> +	size[CONT_PMD] = "1G";
>> +#elif defined(CONFIG_ARM64_64K_PAGES)
>> +	size[PTE] = "64k";
>> +	size[CONT_PTE] = "2M";
>> +	size[PMD] = "512M";
>> +	size[CONT_PMD] = "16G";
>> +#endif
>> +
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PTE], dm_meminfo[PTE] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[CONT_PTE],
>> +			dm_meminfo[CONT_PTE] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PMD], dm_meminfo[PMD] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[CONT_PMD],
>> +			dm_meminfo[CONT_PMD] >> 10);
>> +	if (pud_sect_supported())
>> +		seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PUD], dm_meminfo[PUD] >> 10);
>> +}
>> +
>> +static inline bool is_dm_addr(unsigned long addr)
>> +{
>> +	return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
>> +}
>> +
>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +	if (is_dm_addr(addr))
>> +		dm_meminfo[type] += size;
>> +}
>> +
>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +	if (is_dm_addr(addr))
>> +		dm_meminfo[type] -= size;
>> +}
>> +#else
>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +}
>> +
>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +}
>> +#endif
>> +
>>   static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>>   		     phys_addr_t phys, pgprot_t prot)
>>   {
>> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>   
>>   		init_pte(ptep, addr, next, phys, __prot);
>>   
>> +		if (pgprot_val(__prot) & PTE_CONT)
>> +			dm_meminfo_add(addr, (next - addr), CONT_PTE);
>> +		else
>> +			dm_meminfo_add(addr, (next - addr), PTE);
>> +
>>   		ptep += pte_index(next) - pte_index(addr);
>>   		phys += next - addr;
>>   	} while (addr = next, addr != end);
>> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>>   		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>   			pmd_set_huge(pmdp, phys, prot);
>>   
>> +			/*
>> +			 * It is possible to have mappings allow cont mapping
>> +			 * but disallow block mapping. For example,
>> +			 * map_entry_trampoline().
>> +			 * So we have to increase CONT_PMD and PMD size here
>> +			 * to avoid double counting.
>> +			 */
>> +			if (pgprot_val(prot) & PTE_CONT)
>> +				dm_meminfo_add(addr, (next - addr), CONT_PMD);
>> +			else
>> +				dm_meminfo_add(addr, (next - addr), PMD);
>>   			/*
>>   			 * After the PMD entry has been populated once, we
>>   			 * only allow updates to the permission attributes.
>> @@ -389,6 +485,7 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>>   		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>   			pud_set_huge(pudp, phys, prot);
>>   
>> +			dm_meminfo_add(addr, (next - addr), PUD);
>>   			/*
>>   			 * After the PUD entry has been populated once, we
>>   			 * only allow updates to the permission attributes.
>> @@ -575,16 +672,21 @@ pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
>>   	return  __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
>>   }
>>   
>> -static void split_contpte(pte_t *ptep)
>> +static void split_contpte(unsigned long addr, pte_t *ptep)
>>   {
>>   	int i;
>>   
>> +	dm_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
>> +
>>   	ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
>>   	for (i = 0; i < CONT_PTES; i++, ptep++)
>>   		__set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
>> +
>> +	dm_meminfo_add(addr, CONT_PTE_SIZE, PTE);
>>   }
>>   
>> -static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>> +static int split_pmd(unsigned long addr, pmd_t *pmdp, pmd_t pmd, gfp_t gfp,
>> +		     bool to_cont)
>>   {
>>   	pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
>>   	unsigned long pfn = pmd_pfn(pmd);
>> @@ -606,8 +708,13 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>>   	if (to_cont)
>>   		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>   
>> +	dm_meminfo_sub(addr, PMD_SIZE, PMD);
>>   	for (i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
>>   		__set_pte(ptep, pfn_pte(pfn, prot));
>> +	if (to_cont)
>> +		dm_meminfo_add(addr, PMD_SIZE, CONT_PTE);
>> +	else
>> +		dm_meminfo_add(addr, PMD_SIZE, PTE);
>>   
>>   	/*
>>   	 * Ensure the pte entries are visible to the table walker by the time
>> @@ -619,16 +726,21 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
>>   	return 0;
>>   }
>>   
>> -static void split_contpmd(pmd_t *pmdp)
>> +static void split_contpmd(unsigned long addr, pmd_t *pmdp)
>>   {
>>   	int i;
>>   
>> +	dm_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
>> +
>>   	pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
>>   	for (i = 0; i < CONT_PMDS; i++, pmdp++)
>>   		set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
>> +
>> +	dm_meminfo_add(addr, CONT_PMD_SIZE, PMD);
>>   }
>>   
>> -static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
>> +static int split_pud(unsigned long addr, pud_t *pudp, pud_t pud, gfp_t gfp,
>> +		     bool to_cont)
>>   {
>>   	pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
>>   	unsigned int step = PMD_SIZE >> PAGE_SHIFT;
>> @@ -651,8 +763,13 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
>>   	if (to_cont)
>>   		prot = __pgprot(pgprot_val(prot) | PTE_CONT);
>>   
>> +	dm_meminfo_sub(addr, PUD_SIZE, PUD);
>>   	for (i = 0; i < PTRS_PER_PMD; i++, pmdp++, pfn += step)
>>   		set_pmd(pmdp, pfn_pmd(pfn, prot));
>> +	if (to_cont)
>> +		dm_meminfo_add(addr, PUD_SIZE, CONT_PMD);
>> +	else
>> +		dm_meminfo_add(addr, PUD_SIZE, PMD);
>>   
>>   	/*
>>   	 * Ensure the pmd entries are visible to the table walker by the time
>> @@ -707,7 +824,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>   	if (!pud_present(pud))
>>   		goto out;
>>   	if (pud_leaf(pud)) {
>> -		ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
>> +		ret = split_pud(addr, pudp, pud, GFP_PGTABLE_KERNEL, true);
>>   		if (ret)
>>   			goto out;
>>   	}
>> @@ -725,14 +842,14 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>   		goto out;
>>   	if (pmd_leaf(pmd)) {
>>   		if (pmd_cont(pmd))
>> -			split_contpmd(pmdp);
>> +			split_contpmd(addr, pmdp);
>>   		/*
>>   		 * PMD: If addr is PMD aligned then addr already describes a
>>   		 * leaf boundary. Otherwise, split to contpte.
>>   		 */
>>   		if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
>>   			goto out;
>> -		ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
>> +		ret = split_pmd(addr, pmdp, pmd, GFP_PGTABLE_KERNEL, true);
>>   		if (ret)
>>   			goto out;
>>   	}
>> @@ -749,7 +866,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
>>   	if (!pte_present(pte))
>>   		goto out;
>>   	if (pte_cont(pte))
>> -		split_contpte(ptep);
>> +		split_contpte(addr, ptep);
>>   
>>   out:
>>   	return ret;
>> @@ -835,7 +952,7 @@ static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
>>   	int ret = 0;
>>   
>>   	if (pud_leaf(pud))
>> -		ret = split_pud(pudp, pud, gfp, false);
>> +		ret = split_pud(addr, pudp, pud, gfp, false);
>>   
>>   	return ret;
>>   }
>> @@ -849,8 +966,8 @@ static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
>>   
>>   	if (pmd_leaf(pmd)) {
>>   		if (pmd_cont(pmd))
>> -			split_contpmd(pmdp);
>> -		ret = split_pmd(pmdp, pmd, gfp, false);
>> +			split_contpmd(addr, pmdp);
>> +		ret = split_pmd(addr, pmdp, pmd, gfp, false);
>>   
>>   		/*
>>   		 * We have split the pmd directly to ptes so there is no need to
>> @@ -868,7 +985,7 @@ static int split_to_ptes_pte_entry(pte_t *ptep, unsigned long addr,
>>   	pte_t pte = __ptep_get(ptep);
>>   
>>   	if (pte_cont(pte))
>> -		split_contpte(ptep);
>> +		split_contpte(addr, ptep);
>>   
>>   	return 0;
>>   }
>> @@ -1444,37 +1561,57 @@ static bool pgtable_range_aligned(unsigned long start, unsigned long end,
>>   	return true;
>>   }
>>   
>> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>>   				    unsigned long end, bool free_mapped,
>>   				    struct vmem_altmap *altmap)
>>   {
>> -	pte_t *ptep, pte;
>> +	pte_t pte;
>>   
>>   	do {
>> -		ptep = pte_offset_kernel(pmdp, addr);
>>   		pte = __ptep_get(ptep);
>>   		if (pte_none(pte))
>>   			continue;
>>   
>>   		WARN_ON(!pte_present(pte));
>>   		__pte_clear(&init_mm, addr, ptep);
>> +		dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>>   		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>   		if (free_mapped)
>>   			free_hotplug_page_range(pte_page(pte),
>>   						PAGE_SIZE, altmap);
>> -	} while (addr += PAGE_SIZE, addr < end);
>> +	} while (ptep++, addr += PAGE_SIZE, addr < end);
>> +}
>> +
>> +static void unmap_hotplug_cont_pte_range(pmd_t *pmdp, unsigned long addr,
>> +					 unsigned long end, bool free_mapped,
>> +					 struct vmem_altmap *altmap)
>> +{
>> +	unsigned long next;
>> +	pte_t *ptep, pte;
>> +
>> +	do {
>> +		next = pte_cont_addr_end(addr, end);
>> +		ptep = pte_offset_kernel(pmdp, addr);
>> +		pte = __ptep_get(ptep);
>> +
>> +		if (pte_present(pte) && pte_cont(pte)) {
>> +			dm_meminfo_sub(addr, CONT_PTE_SIZE, CONT_PTE);
>> +			dm_meminfo_add(addr, CONT_PTE_SIZE, PTE);
>> +		}
>> +
>> +		unmap_hotplug_pte_range(ptep, addr, next, free_mapped, altmap);
>> +	} while (addr = next, addr < end);
>>   }
>>   
>> -static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>> +static void unmap_hotplug_pmd_range(pmd_t *pmdp, unsigned long addr,
>>   				    unsigned long end, bool free_mapped,
>>   				    struct vmem_altmap *altmap)
>>   {
>>   	unsigned long next;
>> -	pmd_t *pmdp, pmd;
>> +	pmd_t pmd;
>>   
>>   	do {
>>   		next = pmd_addr_end(addr, end);
>> -		pmdp = pmd_offset(pudp, addr);
>>   		pmd = READ_ONCE(*pmdp);
>>   		if (pmd_none(pmd))
>>   			continue;
>> @@ -1482,6 +1619,7 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>>   		WARN_ON(!pmd_present(pmd));
>>   		if (pmd_sect(pmd)) {
>>   			pmd_clear(pmdp);
>> +			dm_meminfo_sub(addr, PMD_SIZE, PMD);
>>   
>>   			/*
>>   			 * One TLBI should be sufficient here as the PMD_SIZE
>> @@ -1494,7 +1632,28 @@ static void unmap_hotplug_pmd_range(pud_t *pudp, unsigned long addr,
>>   			continue;
>>   		}
>>   		WARN_ON(!pmd_table(pmd));
>> -		unmap_hotplug_pte_range(pmdp, addr, next, free_mapped, altmap);
>> +		unmap_hotplug_cont_pte_range(pmdp, addr, next, free_mapped, altmap);
>> +	} while (pmdp++, addr = next, addr < end);
>> +}
>> +
>> +static void unmap_hotplug_cont_pmd_range(pud_t *pudp, unsigned long addr,
>> +					 unsigned long end, bool free_mapped,
>> +					 struct vmem_altmap *altmap)
>> +{
>> +	unsigned long next;
>> +	pmd_t *pmdp, pmd;
>> +
>> +	do {
>> +		next = pmd_cont_addr_end(addr, end);
>> +		pmdp = pmd_offset(pudp, addr);
>> +		pmd = READ_ONCE(*pmdp);
>> +
>> +		if (pmd_leaf(pmd) && pmd_cont(pmd)) {
>> +			dm_meminfo_sub(addr, CONT_PMD_SIZE, CONT_PMD);
>> +			dm_meminfo_add(addr, CONT_PMD_SIZE, PMD);
>> +		}
>> +
>> +		unmap_hotplug_pmd_range(pmdp, addr, next, free_mapped, altmap);
>>   	} while (addr = next, addr < end);
>>   }
>>   
>> @@ -1515,6 +1674,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
>>   		WARN_ON(!pud_present(pud));
>>   		if (pud_sect(pud)) {
>>   			pud_clear(pudp);
>> +			dm_meminfo_sub(addr, PUD_SIZE, PUD);
>>   
>>   			/*
>>   			 * One TLBI should be sufficient here as the PUD_SIZE
>> @@ -1527,7 +1687,7 @@ static void unmap_hotplug_pud_range(p4d_t *p4dp, unsigned long addr,
>>   			continue;
>>   		}
>>   		WARN_ON(!pud_table(pud));
>> -		unmap_hotplug_pmd_range(pudp, addr, next, free_mapped, altmap);
>> +		unmap_hotplug_cont_pmd_range(pudp, addr, next, free_mapped, altmap);
>>   	} while (addr = next, addr < end);
>>   }
>>   

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Will Deacon 2 weeks, 1 day ago
On Thu, Jan 22, 2026 at 10:39:25AM +0530, Anshuman Khandual wrote:
> Hello Yang,
> 
> On 07/01/26 5:59 AM, Yang Shi wrote:
> > Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
> > rodata=full"), the direct mapping may be split on some machines instead
> > keeping static since boot. It makes more sense to show the direct mapping
> > use in /proc/meminfo than before.
> 
> I guess the direct mapping here refers to linear map ? IIUC it is called
> direct map on x86 and linear map on arm64 platforms. Then should not it
> be renamed as s/DirectMap/LinearMap instead ? This will align with names
> from ptdump as well.
> 
> Before the above mentioned commit, linear could get altered with memory
> hotplug and remove events as well.
> 
> > This patch will make /proc/meminfo show the direct mapping use like the
> > below (4K base page size):
> > DirectMap4K:	   94792 kB
> > DirectMap64K:	  134208 kB
> > DirectMap2M:	 1173504 kB
> > DirectMap32M:	 5636096 kB
> > DirectMap1G:	529530880 kB
> 
> If /proc/meminfo interface is getting updated via  arch_report_meminfo()
> why not add stats for all kernel virtual address space ranges including
> vmemmap, vmalloc etc aka all address range headers in ptdump as many of
> those could change during system runtime. What makes linear mapping any
> special ?

tbh, I think compatibility with x86 is a good argument in this case and
so the naming and formatting proposed by this patch make sense to me.

I'm also not sure that it's particularly interesting to see these
rolled-up numbers for the vmalloc area. You really want per-area
information, so extending /proc/vmallocinfo to report the granule size
for each entry might be useful, but I don't think it should be part of
this patch.

Will
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Anshuman Khandual 2 weeks, 1 day ago

On 22/01/26 7:47 PM, Will Deacon wrote:
> On Thu, Jan 22, 2026 at 10:39:25AM +0530, Anshuman Khandual wrote:
>> Hello Yang,
>>
>> On 07/01/26 5:59 AM, Yang Shi wrote:
>>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>>> rodata=full"), the direct mapping may be split on some machines instead
>>> keeping static since boot. It makes more sense to show the direct mapping
>>> use in /proc/meminfo than before.
>>
>> I guess the direct mapping here refers to linear map ? IIUC it is called
>> direct map on x86 and linear map on arm64 platforms. Then should not it
>> be renamed as s/DirectMap/LinearMap instead ? This will align with names
>> from ptdump as well.
>>
>> Before the above mentioned commit, linear could get altered with memory
>> hotplug and remove events as well.
>>
>>> This patch will make /proc/meminfo show the direct mapping use like the
>>> below (4K base page size):
>>> DirectMap4K:	   94792 kB
>>> DirectMap64K:	  134208 kB
>>> DirectMap2M:	 1173504 kB
>>> DirectMap32M:	 5636096 kB
>>> DirectMap1G:	529530880 kB
>>
>> If /proc/meminfo interface is getting updated via  arch_report_meminfo()
>> why not add stats for all kernel virtual address space ranges including
>> vmemmap, vmalloc etc aka all address range headers in ptdump as many of
>> those could change during system runtime. What makes linear mapping any
>> special ?
> 
> tbh, I think compatability with x86 is a good argument in this case and
> so the naming and formatting proposed by this patch makes sense to me.

Fair enough. Adding a comment above arch_report_meminfo(), along with a note
in the commit message, explaining the above rationale would probably help
developers understand/recall this equivalence later on.
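E.g. something as simple as this above the function (wording is only a
suggestion, not taken from the patch):

	/*
	 * "DirectMap" follows the naming and format that x86 uses in its
	 * arch_report_meminfo() for what arm64 calls the linear map, so
	 * that existing /proc/meminfo consumers keep working unchanged.
	 */
	void arch_report_meminfo(struct seq_file *m)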

> 
> I'm also not sure that it's particularly interesting to see these
> rolled-up numbers for the vmalloc area. You really want information
> about the area, so extending /proc/vmallocinfo to give information
> about the granule size for each entry might be useful but I don't think
> it should be part of this patch.

Agreed - vmalloc has a separate file for its details, which can be improved
later to accommodate rolled-up numbers. But what about vmemmap? It always
gets updated along with the linear map during memory hotplug and remove
events. Should that be included here?
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 2 weeks ago

On 1/22/26 6:40 PM, Anshuman Khandual wrote:
>
> On 22/01/26 7:47 PM, Will Deacon wrote:
>> On Thu, Jan 22, 2026 at 10:39:25AM +0530, Anshuman Khandual wrote:
>>> Hello Yang,
>>>
>>> On 07/01/26 5:59 AM, Yang Shi wrote:
>>>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>>>> rodata=full"), the direct mapping may be split on some machines instead
>>>> keeping static since boot. It makes more sense to show the direct mapping
>>>> use in /proc/meminfo than before.
>>> I guess the direct mapping here refers to linear map ? IIUC it is called
>>> direct map on x86 and linear map on arm64 platforms. Then should not it
>>> be renamed as s/DirectMap/LinearMap instead ? This will align with names
>>> from ptdump as well.
>>>
>>> Before the above mentioned commit, linear could get altered with memory
>>> hotplug and remove events as well.
>>>
>>>> This patch will make /proc/meminfo show the direct mapping use like the
>>>> below (4K base page size):
>>>> DirectMap4K:	   94792 kB
>>>> DirectMap64K:	  134208 kB
>>>> DirectMap2M:	 1173504 kB
>>>> DirectMap32M:	 5636096 kB
>>>> DirectMap1G:	529530880 kB
>>> If /proc/meminfo interface is getting updated via  arch_report_meminfo()
>>> why not add stats for all kernel virtual address space ranges including
>>> vmemmap, vmalloc etc aka all address range headers in ptdump as many of
>>> those could change during system runtime. What makes linear mapping any
>>> special ?
>> tbh, I think compatability with x86 is a good argument in this case and
>> so the naming and formatting proposed by this patch makes sense to me.
> Fair enough. Probably adding a comment above arch_report_meminfo() along
> with the commit message, explaining the above rationale would be helpful
> for developers to understand/recollect this equivalence later on.
>
>> I'm also not sure that it's particularly interesting to see these
>> rolled-up numbers for the vmalloc area. You really want information
>> about the area, so extending /proc/vmallocinfo to give information
>> about the granule size for each entry might be useful but I don't think
>> it should be part of this patch.
> Agreed - vmalloc has a separate file for its details which can be improved
> later to accommodate rolled-up numbers. But what about vmemmap ? It always
> gets updated along with linear map during memory hotplug and remove events.
> Should that be included here ?

The granule size of vmemmap should be quite static IIUC. AFAIK the kernel
doesn't modify vmemmap page tables except for HVO, and arm64 doesn't support
HVO. Also, only the 4K page size can have a PMD-mapped vmemmap, and vmemmap
doesn't use contiguous mappings either. The smallest granularity of memory
hotplug is 128M with 4K page size, so the vmemmap for hotplugged memory
should be PMD mapped as well (128M of 4K pages needs 32768 struct pages,
i.e. roughly 2M of vmemmap assuming a 64-byte struct page, which is exactly
one PMD block).

It might be worth showing the memory consumed by vmemmap for hotplugged
memory so that users can see where the memory goes. The vmemmap used for
bootmem is not counted in MemTotal of /proc/meminfo. But that sounds like
generic core-mm work and shouldn't be part of this patch IMHO.

Thanks,
Yang
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Will Deacon 1 week, 4 days ago
On Fri, Jan 23, 2026 at 12:08:07PM -0800, Yang Shi wrote:
> 
> 
> On 1/22/26 6:40 PM, Anshuman Khandual wrote:
> > 
> > On 22/01/26 7:47 PM, Will Deacon wrote:
> > > On Thu, Jan 22, 2026 at 10:39:25AM +0530, Anshuman Khandual wrote:
> > > > Hello Yang,
> > > > 
> > > > On 07/01/26 5:59 AM, Yang Shi wrote:
> > > > > Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
> > > > > rodata=full"), the direct mapping may be split on some machines instead
> > > > > keeping static since boot. It makes more sense to show the direct mapping
> > > > > use in /proc/meminfo than before.
> > > > I guess the direct mapping here refers to linear map ? IIUC it is called
> > > > direct map on x86 and linear map on arm64 platforms. Then should not it
> > > > be renamed as s/DirectMap/LinearMap instead ? This will align with names
> > > > from ptdump as well.
> > > > 
> > > > Before the above mentioned commit, linear could get altered with memory
> > > > hotplug and remove events as well.
> > > > 
> > > > > This patch will make /proc/meminfo show the direct mapping use like the
> > > > > below (4K base page size):
> > > > > DirectMap4K:	   94792 kB
> > > > > DirectMap64K:	  134208 kB
> > > > > DirectMap2M:	 1173504 kB
> > > > > DirectMap32M:	 5636096 kB
> > > > > DirectMap1G:	529530880 kB
> > > > If /proc/meminfo interface is getting updated via  arch_report_meminfo()
> > > > why not add stats for all kernel virtual address space ranges including
> > > > vmemmap, vmalloc etc aka all address range headers in ptdump as many of
> > > > those could change during system runtime. What makes linear mapping any
> > > > special ?
> > > tbh, I think compatability with x86 is a good argument in this case and
> > > so the naming and formatting proposed by this patch makes sense to me.
> > Fair enough. Probably adding a comment above arch_report_meminfo() along
> > with the commit message, explaining the above rationale would be helpful
> > for developers to understand/recollect this equivalence later on.
> > 
> > > I'm also not sure that it's particularly interesting to see these
> > > rolled-up numbers for the vmalloc area. You really want information
> > > about the area, so extending /proc/vmallocinfo to give information
> > > about the granule size for each entry might be useful but I don't think
> > > it should be part of this patch.
> > Agreed - vmalloc has a separate file for its details which can be improved
> > later to accommodate rolled-up numbers. But what about vmemmap ? It always
> > gets updated along with linear map during memory hotplug and remove events.
> > Should that be included here ?
> 
> The granule size of vmemmap should be quite static IIUC. AFAIK kernel
> doesn't modify vmemmap page tables except HVO, but arm64 doesn't support
> HVO. And just 4K page size can have PMD mapped vmemmap. It doesn't use
> contiguous mapping either. The smallest granularity of memory hotplug is
> 128M with 4K page size, so vmemmap for hotplugged memory should be PMD
> mapped as well.
> 
> It might be worth showing the memory consumed by vmemmap for hotplug memory
> so that the users can know where the memory is used. The vmemmap used for
> bootmem is not counted in MemTotal of /proc/meminfo. But it sounds more core
> mm generic and shouldn't be part of this patch IMHO.

Agreed, I don't understand what the granularity of the vmemmap has to do
with this.

Will
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Will Deacon 3 weeks, 3 days ago
On Tue, Jan 06, 2026 at 04:29:44PM -0800, Yang Shi wrote:
> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
> rodata=full"), the direct mapping may be split on some machines instead
> keeping static since boot. It makes more sense to show the direct mapping
> use in /proc/meminfo than before.
> This patch will make /proc/meminfo show the direct mapping use like the
> below (4K base page size):
> DirectMap4K:	   94792 kB
> DirectMap64K:	  134208 kB
> DirectMap2M:	 1173504 kB
> DirectMap32M:	 5636096 kB
> DirectMap1G:	529530880 kB
> 
> Although just the machines which support BBML2_NOABORT can split the
> direct mapping, show it on all machines regardless of BBML2_NOABORT so
> that the users have consistent view in order to avoid confusion.
> 
> Although ptdump also can tell the direct map use, but it needs to dump
> the whole kernel page table. It is costly and overkilling. It is also
> in debugfs which may not be enabled by all distros. So showing direct
> map use in /proc/meminfo seems more convenient and has less overhead.
> 
> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
> ---
> v5: * Rebased to v6.19-rc4
>     * Fixed the build error for !CONFIG_PROC_FS
> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>     * Used shorter name for the helpers and variables per Ryan
>     * Fixed accounting for memory hotunplug
> v3: * Fixed the over-accounting problems per Ryan
>     * Introduced helpers for add/sub direct map use and #ifdef them with
>       CONFIG_PROC_FS per Ryan
>     * v3 is a fix patch on top of v2
> v2: * Counted in size instead of the number of entries per Ryan
>     * Removed shift array per Ryan
>     * Use lower case "k" per Ryan
>     * Fixed a couple of build warnings reported by kernel test robot
>     * Fixed a couple of poential miscounts
> 
>  arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 181 insertions(+), 21 deletions(-)
> 
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 8e1d80a7033e..422441c9a992 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -29,6 +29,7 @@
>  #include <linux/mm_inline.h>
>  #include <linux/pagewalk.h>
>  #include <linux/stop_machine.h>
> +#include <linux/proc_fs.h>
>  
>  #include <asm/barrier.h>
>  #include <asm/cputype.h>
> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>  	dsb(ishst);
>  }
>  
> +enum dm_type {
> +	PTE,
> +	CONT_PTE,
> +	PMD,
> +	CONT_PMD,
> +	PUD,
> +	NR_DM_TYPE,
> +};
> +
> +#ifdef CONFIG_PROC_FS
> +static unsigned long dm_meminfo[NR_DM_TYPE];
> +
> +void arch_report_meminfo(struct seq_file *m)
> +{
> +	char *size[NR_DM_TYPE];

const?

> +
> +#if defined(CONFIG_ARM64_4K_PAGES)
> +	size[PTE] = "4k";
> +	size[CONT_PTE] = "64k";
> +	size[PMD] = "2M";
> +	size[CONT_PMD] = "32M";
> +	size[PUD] = "1G";
> +#elif defined(CONFIG_ARM64_16K_PAGES)
> +	size[PTE] = "16k";
> +	size[CONT_PTE] = "2M";
> +	size[PMD] = "32M";
> +	size[CONT_PMD] = "1G";
> +#elif defined(CONFIG_ARM64_64K_PAGES)
> +	size[PTE] = "64k";
> +	size[CONT_PTE] = "2M";
> +	size[PMD] = "512M";
> +	size[CONT_PMD] = "16G";
> +#endif
> +
> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> +			size[PTE], dm_meminfo[PTE] >> 10);
> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> +			size[CONT_PTE],
> +			dm_meminfo[CONT_PTE] >> 10);
> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> +			size[PMD], dm_meminfo[PMD] >> 10);
> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> +			size[CONT_PMD],
> +			dm_meminfo[CONT_PMD] >> 10);
> +	if (pud_sect_supported())
> +		seq_printf(m, "DirectMap%s:	%8lu kB\n",
> +			size[PUD], dm_meminfo[PUD] >> 10);

This seems a bit brittle to me. If somebody adds support for level-1 (PUD)
block mappings for !4k pages in the future, they will forget to update this
and we'll end up printing uninitialized kernel stack data in /proc/meminfo,
afaict.
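E.g. a rough, untested sketch (reusing only the dm_type enum and dm_meminfo
array from the patch; the dm_size name is made up here) that fails safe by
skipping any level without a size string instead of printing garbage:

	static const char *dm_size[NR_DM_TYPE] = {
	#if defined(CONFIG_ARM64_4K_PAGES)
		[PTE] = "4k", [CONT_PTE] = "64k", [PMD] = "2M",
		[CONT_PMD] = "32M", [PUD] = "1G",
	#elif defined(CONFIG_ARM64_16K_PAGES)
		[PTE] = "16k", [CONT_PTE] = "2M", [PMD] = "32M",
		[CONT_PMD] = "1G",
	#elif defined(CONFIG_ARM64_64K_PAGES)
		[PTE] = "64k", [CONT_PTE] = "2M", [PMD] = "512M",
		[CONT_PMD] = "16G",
	#endif
	};

	void arch_report_meminfo(struct seq_file *m)
	{
		int i;

		for (i = 0; i < NR_DM_TYPE; i++) {
			/* No size string: no block mapping support at this level. */
			if (!dm_size[i])
				continue;
			seq_printf(m, "DirectMap%s:	%8lu kB\n",
				   dm_size[i], dm_meminfo[i] >> 10);
		}
	}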

> +static inline bool is_dm_addr(unsigned long addr)
> +{
> +	return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
> +}
> +
> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
> +				  enum dm_type type)
> +{
> +	if (is_dm_addr(addr))
> +		dm_meminfo[type] += size;
> +}
> +
> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
> +				  enum dm_type type)
> +{
> +	if (is_dm_addr(addr))
> +		dm_meminfo[type] -= size;
> +}
> +#else
> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
> +				  enum dm_type type)
> +{
> +}
> +
> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
> +				  enum dm_type type)
> +{
> +}
> +#endif
> +
>  static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>  		     phys_addr_t phys, pgprot_t prot)
>  {
> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>  
>  		init_pte(ptep, addr, next, phys, __prot);
>  
> +		if (pgprot_val(__prot) & PTE_CONT)
> +			dm_meminfo_add(addr, (next - addr), CONT_PTE);
> +		else
> +			dm_meminfo_add(addr, (next - addr), PTE);
> +
>  		ptep += pte_index(next) - pte_index(addr);
>  		phys += next - addr;
>  	} while (addr = next, addr != end);
> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>  		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>  			pmd_set_huge(pmdp, phys, prot);
>  
> +			/*
> +			 * It is possible to have mappings allow cont mapping
> +			 * but disallow block mapping. For example,
> +			 * map_entry_trampoline().
> +			 * So we have to increase CONT_PMD and PMD size here
> +			 * to avoid double counting.
> +			 */
> +			if (pgprot_val(prot) & PTE_CONT)
> +				dm_meminfo_add(addr, (next - addr), CONT_PMD);
> +			else
> +				dm_meminfo_add(addr, (next - addr), PMD);

I don't understand the comment you're adding here. If somebody passes
NO_BLOCK_MAPPINGS then that also prevents contiguous entries except at
level 3.

It also doesn't look like you handle the error case properly when the
mapping fails.

> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>  				    unsigned long end, bool free_mapped,
>  				    struct vmem_altmap *altmap)
>  {
> -	pte_t *ptep, pte;
> +	pte_t pte;
>  
>  	do {
> -		ptep = pte_offset_kernel(pmdp, addr);
>  		pte = __ptep_get(ptep);
>  		if (pte_none(pte))
>  			continue;
>  
>  		WARN_ON(!pte_present(pte));
>  		__pte_clear(&init_mm, addr, ptep);
> +		dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>  		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>  		if (free_mapped)
>  			free_hotplug_page_range(pte_page(pte),
>  						PAGE_SIZE, altmap);

Is the existing code correct for contiguous entries here? I'd have
thought that we'd need to make the range non-contiguous before knocking
out the TLB.

Will
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Ryan Roberts 2 weeks, 2 days ago
On 13/01/2026 14:36, Will Deacon wrote:
> On Tue, Jan 06, 2026 at 04:29:44PM -0800, Yang Shi wrote:
>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>> rodata=full"), the direct mapping may be split on some machines instead
>> keeping static since boot. It makes more sense to show the direct mapping
>> use in /proc/meminfo than before.
>> This patch will make /proc/meminfo show the direct mapping use like the
>> below (4K base page size):
>> DirectMap4K:	   94792 kB
>> DirectMap64K:	  134208 kB
>> DirectMap2M:	 1173504 kB
>> DirectMap32M:	 5636096 kB
>> DirectMap1G:	529530880 kB
>>
>> Although just the machines which support BBML2_NOABORT can split the
>> direct mapping, show it on all machines regardless of BBML2_NOABORT so
>> that the users have consistent view in order to avoid confusion.
>>
>> Although ptdump also can tell the direct map use, but it needs to dump
>> the whole kernel page table. It is costly and overkilling. It is also
>> in debugfs which may not be enabled by all distros. So showing direct
>> map use in /proc/meminfo seems more convenient and has less overhead.
>>
>> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
>> ---
>> v5: * Rebased to v6.19-rc4
>>     * Fixed the build error for !CONFIG_PROC_FS
>> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>>     * Used shorter name for the helpers and variables per Ryan
>>     * Fixed accounting for memory hotunplug
>> v3: * Fixed the over-accounting problems per Ryan
>>     * Introduced helpers for add/sub direct map use and #ifdef them with
>>       CONFIG_PROC_FS per Ryan
>>     * v3 is a fix patch on top of v2
>> v2: * Counted in size instead of the number of entries per Ryan
>>     * Removed shift array per Ryan
>>     * Use lower case "k" per Ryan
>>     * Fixed a couple of build warnings reported by kernel test robot
>>     * Fixed a couple of poential miscounts
>>
>>  arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++++++-----
>>  1 file changed, 181 insertions(+), 21 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 8e1d80a7033e..422441c9a992 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -29,6 +29,7 @@
>>  #include <linux/mm_inline.h>
>>  #include <linux/pagewalk.h>
>>  #include <linux/stop_machine.h>
>> +#include <linux/proc_fs.h>
>>  
>>  #include <asm/barrier.h>
>>  #include <asm/cputype.h>
>> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>>  	dsb(ishst);
>>  }
>>  
>> +enum dm_type {
>> +	PTE,
>> +	CONT_PTE,
>> +	PMD,
>> +	CONT_PMD,
>> +	PUD,
>> +	NR_DM_TYPE,
>> +};
>> +
>> +#ifdef CONFIG_PROC_FS
>> +static unsigned long dm_meminfo[NR_DM_TYPE];
>> +
>> +void arch_report_meminfo(struct seq_file *m)
>> +{
>> +	char *size[NR_DM_TYPE];
> 
> const?
> 
>> +
>> +#if defined(CONFIG_ARM64_4K_PAGES)
>> +	size[PTE] = "4k";
>> +	size[CONT_PTE] = "64k";
>> +	size[PMD] = "2M";
>> +	size[CONT_PMD] = "32M";
>> +	size[PUD] = "1G";
>> +#elif defined(CONFIG_ARM64_16K_PAGES)
>> +	size[PTE] = "16k";
>> +	size[CONT_PTE] = "2M";
>> +	size[PMD] = "32M";
>> +	size[CONT_PMD] = "1G";
>> +#elif defined(CONFIG_ARM64_64K_PAGES)
>> +	size[PTE] = "64k";
>> +	size[CONT_PTE] = "2M";
>> +	size[PMD] = "512M";
>> +	size[CONT_PMD] = "16G";
>> +#endif
>> +
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PTE], dm_meminfo[PTE] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[CONT_PTE],
>> +			dm_meminfo[CONT_PTE] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PMD], dm_meminfo[PMD] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[CONT_PMD],
>> +			dm_meminfo[CONT_PMD] >> 10);
>> +	if (pud_sect_supported())
>> +		seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PUD], dm_meminfo[PUD] >> 10);
> 
> This seems a bit brittle to me. If somebody adds support for l1 block
> mappings for !4k pages in future, they will forget to update this and
> we'll end up returning kernel stack in /proc/meminfo afaict.
> 
>> +static inline bool is_dm_addr(unsigned long addr)
>> +{
>> +	return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
>> +}
>> +
>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +	if (is_dm_addr(addr))
>> +		dm_meminfo[type] += size;
>> +}
>> +
>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +	if (is_dm_addr(addr))
>> +		dm_meminfo[type] -= size;
>> +}
>> +#else
>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +}
>> +
>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +}
>> +#endif
>> +
>>  static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>>  		     phys_addr_t phys, pgprot_t prot)
>>  {
>> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>  
>>  		init_pte(ptep, addr, next, phys, __prot);
>>  
>> +		if (pgprot_val(__prot) & PTE_CONT)
>> +			dm_meminfo_add(addr, (next - addr), CONT_PTE);
>> +		else
>> +			dm_meminfo_add(addr, (next - addr), PTE);
>> +
>>  		ptep += pte_index(next) - pte_index(addr);
>>  		phys += next - addr;
>>  	} while (addr = next, addr != end);
>> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>>  		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>  			pmd_set_huge(pmdp, phys, prot);
>>  
>> +			/*
>> +			 * It is possible to have mappings allow cont mapping
>> +			 * but disallow block mapping. For example,
>> +			 * map_entry_trampoline().
>> +			 * So we have to increase CONT_PMD and PMD size here
>> +			 * to avoid double counting.
>> +			 */
>> +			if (pgprot_val(prot) & PTE_CONT)
>> +				dm_meminfo_add(addr, (next - addr), CONT_PMD);
>> +			else
>> +				dm_meminfo_add(addr, (next - addr), PMD);
> 
> I don't understand the comment you're adding here. If somebody passes
> NO_BLOCK_MAPPINGS then that also prevents contiguous entries except at
> level 3.
> 
> It also doesn't look you handle the error case properly when the mapping
> fails.
> 
>> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>>  				    unsigned long end, bool free_mapped,
>>  				    struct vmem_altmap *altmap)
>>  {
>> -	pte_t *ptep, pte;
>> +	pte_t pte;
>>  
>>  	do {
>> -		ptep = pte_offset_kernel(pmdp, addr);
>>  		pte = __ptep_get(ptep);
>>  		if (pte_none(pte))
>>  			continue;
>>  
>>  		WARN_ON(!pte_present(pte));
>>  		__pte_clear(&init_mm, addr, ptep);
>> +		dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>>  		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>  		if (free_mapped)
>>  			free_hotplug_page_range(pte_page(pte),
>>  						PAGE_SIZE, altmap);
> 
> Is the existing code correct for contiguous entries here? I'd have
> thought that we'd need to make the range non-contiguous before knocking
> out the TLB.

The Arm ARM has this, which makes me think you are probably correct:

IVNXYF:
The architecture does not require descriptors with the Contiguous bit set to 1
to be cached as a single TLB entry for the contiguous region. To avoid TLB
coherency issues, software is required to perform TLB maintenance on the entire
address region that results from using the Contiguous bit.

I've asked for clarification internally. But I think we should hoist the TLB
flush out of the loop regardless, because it will be faster if we just
invalidate a single range. I can handle that as a separate patch if you like.
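For illustration, something like this (rough, untested; helper name made up
here) is the sort of maintenance IVNXYF seems to require if the flush stays
per entry; hoisting a single flush_tlb_kernel_range() over the whole range
(before any pages are freed) would cover it just as well and more cheaply:

	/*
	 * When clearing an entry that was part of a contiguous range,
	 * invalidate the whole naturally aligned contiguous region, not
	 * just the single page.
	 */
	static void flush_unmapped_pte(unsigned long addr, pte_t old_pte)
	{
		unsigned long start = addr, end = addr + PAGE_SIZE;

		if (pte_cont(old_pte)) {
			start = ALIGN_DOWN(addr, CONT_PTE_SIZE);
			end = start + CONT_PTE_SIZE;
		}

		flush_tlb_kernel_range(start, end);
	}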


However, I think there may be another problem; IIUC, any old range of memory can
be hot-unplugged as long as it is section aligned. It doesn't have to be the
same range that was previously hot-plugged. But if the linear map is block
mapped, the range being unplugged may cover a partial block mapping.

For example, with 4K pages, the section size is 128M, so you could hot unmap
128M from a PUD leaf mapping (1G). What am I missing that means this doesn't go
bang?

This would have been an issue for the non-rodata-full config, so it predates
the work to split the linear map dynamically. I'm not really sure how to
solve this for systems without BBML2 but with a non-rodata-full config.

I must be misunderstanding something crucial here... I'll dig some more.

Thanks,
Ryan



> 
> Will
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 2 weeks, 2 days ago

On 1/21/26 9:23 AM, Ryan Roberts wrote:
> On 13/01/2026 14:36, Will Deacon wrote:
>> On Tue, Jan 06, 2026 at 04:29:44PM -0800, Yang Shi wrote:
>>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>>> rodata=full"), the direct mapping may be split on some machines instead
>>> keeping static since boot. It makes more sense to show the direct mapping
>>> use in /proc/meminfo than before.
>>> This patch will make /proc/meminfo show the direct mapping use like the
>>> below (4K base page size):
>>> DirectMap4K:	   94792 kB
>>> DirectMap64K:	  134208 kB
>>> DirectMap2M:	 1173504 kB
>>> DirectMap32M:	 5636096 kB
>>> DirectMap1G:	529530880 kB
>>>
>>> Although just the machines which support BBML2_NOABORT can split the
>>> direct mapping, show it on all machines regardless of BBML2_NOABORT so
>>> that the users have consistent view in order to avoid confusion.
>>>
>>> Although ptdump also can tell the direct map use, but it needs to dump
>>> the whole kernel page table. It is costly and overkilling. It is also
>>> in debugfs which may not be enabled by all distros. So showing direct
>>> map use in /proc/meminfo seems more convenient and has less overhead.
>>>
>>> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
>>> ---
>>> v5: * Rebased to v6.19-rc4
>>>      * Fixed the build error for !CONFIG_PROC_FS
>>> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>>>      * Used shorter name for the helpers and variables per Ryan
>>>      * Fixed accounting for memory hotunplug
>>> v3: * Fixed the over-accounting problems per Ryan
>>>      * Introduced helpers for add/sub direct map use and #ifdef them with
>>>        CONFIG_PROC_FS per Ryan
>>>      * v3 is a fix patch on top of v2
>>> v2: * Counted in size instead of the number of entries per Ryan
>>>      * Removed shift array per Ryan
>>>      * Use lower case "k" per Ryan
>>>      * Fixed a couple of build warnings reported by kernel test robot
>>>      * Fixed a couple of poential miscounts
>>>
>>>   arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++++++-----
>>>   1 file changed, 181 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>>> index 8e1d80a7033e..422441c9a992 100644
>>> --- a/arch/arm64/mm/mmu.c
>>> +++ b/arch/arm64/mm/mmu.c
>>> @@ -29,6 +29,7 @@
>>>   #include <linux/mm_inline.h>
>>>   #include <linux/pagewalk.h>
>>>   #include <linux/stop_machine.h>
>>> +#include <linux/proc_fs.h>
>>>   
>>>   #include <asm/barrier.h>
>>>   #include <asm/cputype.h>
>>> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>>>   	dsb(ishst);
>>>   }
>>>   
>>> +enum dm_type {
>>> +	PTE,
>>> +	CONT_PTE,
>>> +	PMD,
>>> +	CONT_PMD,
>>> +	PUD,
>>> +	NR_DM_TYPE,
>>> +};
>>> +
>>> +#ifdef CONFIG_PROC_FS
>>> +static unsigned long dm_meminfo[NR_DM_TYPE];
>>> +
>>> +void arch_report_meminfo(struct seq_file *m)
>>> +{
>>> +	char *size[NR_DM_TYPE];
>> const?
>>
>>> +
>>> +#if defined(CONFIG_ARM64_4K_PAGES)
>>> +	size[PTE] = "4k";
>>> +	size[CONT_PTE] = "64k";
>>> +	size[PMD] = "2M";
>>> +	size[CONT_PMD] = "32M";
>>> +	size[PUD] = "1G";
>>> +#elif defined(CONFIG_ARM64_16K_PAGES)
>>> +	size[PTE] = "16k";
>>> +	size[CONT_PTE] = "2M";
>>> +	size[PMD] = "32M";
>>> +	size[CONT_PMD] = "1G";
>>> +#elif defined(CONFIG_ARM64_64K_PAGES)
>>> +	size[PTE] = "64k";
>>> +	size[CONT_PTE] = "2M";
>>> +	size[PMD] = "512M";
>>> +	size[CONT_PMD] = "16G";
>>> +#endif
>>> +
>>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>> +			size[PTE], dm_meminfo[PTE] >> 10);
>>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>> +			size[CONT_PTE],
>>> +			dm_meminfo[CONT_PTE] >> 10);
>>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>> +			size[PMD], dm_meminfo[PMD] >> 10);
>>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>> +			size[CONT_PMD],
>>> +			dm_meminfo[CONT_PMD] >> 10);
>>> +	if (pud_sect_supported())
>>> +		seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>> +			size[PUD], dm_meminfo[PUD] >> 10);
>> This seems a bit brittle to me. If somebody adds support for l1 block
>> mappings for !4k pages in future, they will forget to update this and
>> we'll end up returning kernel stack in /proc/meminfo afaict.
>>
>>> +static inline bool is_dm_addr(unsigned long addr)
>>> +{
>>> +	return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
>>> +}
>>> +
>>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>>> +				  enum dm_type type)
>>> +{
>>> +	if (is_dm_addr(addr))
>>> +		dm_meminfo[type] += size;
>>> +}
>>> +
>>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>>> +				  enum dm_type type)
>>> +{
>>> +	if (is_dm_addr(addr))
>>> +		dm_meminfo[type] -= size;
>>> +}
>>> +#else
>>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>>> +				  enum dm_type type)
>>> +{
>>> +}
>>> +
>>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>>> +				  enum dm_type type)
>>> +{
>>> +}
>>> +#endif
>>> +
>>>   static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>>>   		     phys_addr_t phys, pgprot_t prot)
>>>   {
>>> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>>   
>>>   		init_pte(ptep, addr, next, phys, __prot);
>>>   
>>> +		if (pgprot_val(__prot) & PTE_CONT)
>>> +			dm_meminfo_add(addr, (next - addr), CONT_PTE);
>>> +		else
>>> +			dm_meminfo_add(addr, (next - addr), PTE);
>>> +
>>>   		ptep += pte_index(next) - pte_index(addr);
>>>   		phys += next - addr;
>>>   	} while (addr = next, addr != end);
>>> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>>>   		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>>   			pmd_set_huge(pmdp, phys, prot);
>>>   
>>> +			/*
>>> +			 * It is possible to have mappings allow cont mapping
>>> +			 * but disallow block mapping. For example,
>>> +			 * map_entry_trampoline().
>>> +			 * So we have to increase CONT_PMD and PMD size here
>>> +			 * to avoid double counting.
>>> +			 */
>>> +			if (pgprot_val(prot) & PTE_CONT)
>>> +				dm_meminfo_add(addr, (next - addr), CONT_PMD);
>>> +			else
>>> +				dm_meminfo_add(addr, (next - addr), PMD);
>> I don't understand the comment you're adding here. If somebody passes
>> NO_BLOCK_MAPPINGS then that also prevents contiguous entries except at
>> level 3.
>>
>> It also doesn't look you handle the error case properly when the mapping
>> fails.
>>
>>> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>>> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>>>   				    unsigned long end, bool free_mapped,
>>>   				    struct vmem_altmap *altmap)
>>>   {
>>> -	pte_t *ptep, pte;
>>> +	pte_t pte;
>>>   
>>>   	do {
>>> -		ptep = pte_offset_kernel(pmdp, addr);
>>>   		pte = __ptep_get(ptep);
>>>   		if (pte_none(pte))
>>>   			continue;
>>>   
>>>   		WARN_ON(!pte_present(pte));
>>>   		__pte_clear(&init_mm, addr, ptep);
>>> +		dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>>>   		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>>   		if (free_mapped)
>>>   			free_hotplug_page_range(pte_page(pte),
>>>   						PAGE_SIZE, altmap);
>> Is the existing code correct for contiguous entries here? I'd have
>> thought that we'd need to make the range non-contiguous before knocking
>> out the TLB.
> The Arm ARM has this, which makes me think you are probably correct:
>
> IVNXYF:
> The architecture does not require descriptors with the Contiguous bit set to 1
> to be cached as a single TLB entry for the contiguous region. To avoid TLB
> coherency issues, software is required to perform TLB maintenance on the entire
> address region that results from using the Contiguous bit.
>
> I've asked for clarification internally. But I think we should hoist out the tlb
> flush regardless because it will be faster if we just invalidate a single range.
> I can handle that as a separate patch if you like.

Thanks, Ryan.

>
>
> However, I think there may be another problem; IIUC, any old range of memory can
> be hot-unplugged as long as it is section aligned. It doesn't have to be the
> same range that was previously hot-plugged. But if the linear map is block
> mapped, the range being unplugged may cover a partial block mapping.
>
> For example, with 4K pages, the section size is 128M, so you could hot unmap
> 128M from a PUD leaf mapping (1G). What am I missing that means this doesn't go
> bang?
>
> This would have been an issue for the non-rodata-full config so predates the
> work to split the linear map dynamically. I'm not really sure how to solve this
> for systems without BBML2 but without non--rodata-full.
>
> I must be misunderstanding something crucial here... I'll dig some more.

I'm not an expert on memory hotplug, so I'm not 100% confident my
understanding is correct. But I noticed something I had misunderstood
before, while I was testing the patch.

Hotunplug actually has two stages: offline and unplug (physically
removing the device).

When we echo offline to the sysfs file, it only performs the offline
stage. Offlining just isolates the memory from the buddy allocator; it
does *NOT* unmap the memory from the linear mapping. The linear mapping
is only unmapped at the unplug stage. When I tested the patch with
QEMU, I could only emulate hotplugging/hotunplugging a whole DIMM, for
example hotplug 1G and then hotunplug the same 1G; I couldn't emulate
hotunplugging a smaller size. At first I thought that was a limitation
of QEMU, but then I realized we can't hotunplug part of a DIMM
physically either, right? For example, if we insert a 1G DIMM into the
board, we can't physically take 128M out of it. So IIUC a partial unmap
of the linear mapping should never happen.

If I read the code correctly, it does unmap the linear mapping at
memory block granularity. A block linear mapping that covers multiple
memory blocks is unmapped when the first of those blocks is removed;
the page table entries are then none when the later blocks are removed,
but that is OK, the code just continues.

Thanks,
Yang

>
> Thanks,
> Ryan
>
>
>
>> Will
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Ryan Roberts 2 weeks, 1 day ago
On 21/01/2026 22:44, Yang Shi wrote:
> 
> 
> On 1/21/26 9:23 AM, Ryan Roberts wrote:
>> On 13/01/2026 14:36, Will Deacon wrote:
>>> On Tue, Jan 06, 2026 at 04:29:44PM -0800, Yang Shi wrote:
>>>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>>>> rodata=full"), the direct mapping may be split on some machines instead
>>>> keeping static since boot. It makes more sense to show the direct mapping
>>>> use in /proc/meminfo than before.
>>>> This patch will make /proc/meminfo show the direct mapping use like the
>>>> below (4K base page size):
>>>> DirectMap4K:       94792 kB
>>>> DirectMap64K:      134208 kB
>>>> DirectMap2M:     1173504 kB
>>>> DirectMap32M:     5636096 kB
>>>> DirectMap1G:    529530880 kB
>>>>
>>>> Although just the machines which support BBML2_NOABORT can split the
>>>> direct mapping, show it on all machines regardless of BBML2_NOABORT so
>>>> that the users have consistent view in order to avoid confusion.
>>>>
>>>> Although ptdump also can tell the direct map use, but it needs to dump
>>>> the whole kernel page table. It is costly and overkilling. It is also
>>>> in debugfs which may not be enabled by all distros. So showing direct
>>>> map use in /proc/meminfo seems more convenient and has less overhead.
>>>>
>>>> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
>>>> ---
>>>> v5: * Rebased to v6.19-rc4
>>>>      * Fixed the build error for !CONFIG_PROC_FS
>>>> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>>>>      * Used shorter name for the helpers and variables per Ryan
>>>>      * Fixed accounting for memory hotunplug
>>>> v3: * Fixed the over-accounting problems per Ryan
>>>>      * Introduced helpers for add/sub direct map use and #ifdef them with
>>>>        CONFIG_PROC_FS per Ryan
>>>>      * v3 is a fix patch on top of v2
>>>> v2: * Counted in size instead of the number of entries per Ryan
>>>>      * Removed shift array per Ryan
>>>>      * Use lower case "k" per Ryan
>>>>      * Fixed a couple of build warnings reported by kernel test robot
>>>>      * Fixed a couple of poential miscounts
>>>>
>>>>   arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++++++-----
>>>>   1 file changed, 181 insertions(+), 21 deletions(-)
>>>>
>>>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>>>> index 8e1d80a7033e..422441c9a992 100644
>>>> --- a/arch/arm64/mm/mmu.c
>>>> +++ b/arch/arm64/mm/mmu.c
>>>> @@ -29,6 +29,7 @@
>>>>   #include <linux/mm_inline.h>
>>>>   #include <linux/pagewalk.h>
>>>>   #include <linux/stop_machine.h>
>>>> +#include <linux/proc_fs.h>
>>>>     #include <asm/barrier.h>
>>>>   #include <asm/cputype.h>
>>>> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>>>>       dsb(ishst);
>>>>   }
>>>>   +enum dm_type {
>>>> +    PTE,
>>>> +    CONT_PTE,
>>>> +    PMD,
>>>> +    CONT_PMD,
>>>> +    PUD,
>>>> +    NR_DM_TYPE,
>>>> +};
>>>> +
>>>> +#ifdef CONFIG_PROC_FS
>>>> +static unsigned long dm_meminfo[NR_DM_TYPE];
>>>> +
>>>> +void arch_report_meminfo(struct seq_file *m)
>>>> +{
>>>> +    char *size[NR_DM_TYPE];
>>> const?
>>>
>>>> +
>>>> +#if defined(CONFIG_ARM64_4K_PAGES)
>>>> +    size[PTE] = "4k";
>>>> +    size[CONT_PTE] = "64k";
>>>> +    size[PMD] = "2M";
>>>> +    size[CONT_PMD] = "32M";
>>>> +    size[PUD] = "1G";
>>>> +#elif defined(CONFIG_ARM64_16K_PAGES)
>>>> +    size[PTE] = "16k";
>>>> +    size[CONT_PTE] = "2M";
>>>> +    size[PMD] = "32M";
>>>> +    size[CONT_PMD] = "1G";
>>>> +#elif defined(CONFIG_ARM64_64K_PAGES)
>>>> +    size[PTE] = "64k";
>>>> +    size[CONT_PTE] = "2M";
>>>> +    size[PMD] = "512M";
>>>> +    size[CONT_PMD] = "16G";
>>>> +#endif
>>>> +
>>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>>> +            size[PTE], dm_meminfo[PTE] >> 10);
>>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>>> +            size[CONT_PTE],
>>>> +            dm_meminfo[CONT_PTE] >> 10);
>>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>>> +            size[PMD], dm_meminfo[PMD] >> 10);
>>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>>> +            size[CONT_PMD],
>>>> +            dm_meminfo[CONT_PMD] >> 10);
>>>> +    if (pud_sect_supported())
>>>> +        seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>>> +            size[PUD], dm_meminfo[PUD] >> 10);
>>> This seems a bit brittle to me. If somebody adds support for l1 block
>>> mappings for !4k pages in future, they will forget to update this and
>>> we'll end up returning kernel stack in /proc/meminfo afaict.
>>>
>>>> +static inline bool is_dm_addr(unsigned long addr)
>>>> +{
>>>> +    return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
>>>> +}
>>>> +
>>>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>>>> +                  enum dm_type type)
>>>> +{
>>>> +    if (is_dm_addr(addr))
>>>> +        dm_meminfo[type] += size;
>>>> +}
>>>> +
>>>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>>>> +                  enum dm_type type)
>>>> +{
>>>> +    if (is_dm_addr(addr))
>>>> +        dm_meminfo[type] -= size;
>>>> +}
>>>> +#else
>>>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>>>> +                  enum dm_type type)
>>>> +{
>>>> +}
>>>> +
>>>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>>>> +                  enum dm_type type)
>>>> +{
>>>> +}
>>>> +#endif
>>>> +
>>>>   static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>>>>                phys_addr_t phys, pgprot_t prot)
>>>>   {
>>>> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned
>>>> long addr,
>>>>             init_pte(ptep, addr, next, phys, __prot);
>>>>   +        if (pgprot_val(__prot) & PTE_CONT)
>>>> +            dm_meminfo_add(addr, (next - addr), CONT_PTE);
>>>> +        else
>>>> +            dm_meminfo_add(addr, (next - addr), PTE);
>>>> +
>>>>           ptep += pte_index(next) - pte_index(addr);
>>>>           phys += next - addr;
>>>>       } while (addr = next, addr != end);
>>>> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr,
>>>> unsigned long end,
>>>>               (flags & NO_BLOCK_MAPPINGS) == 0) {
>>>>               pmd_set_huge(pmdp, phys, prot);
>>>>   +            /*
>>>> +             * It is possible to have mappings allow cont mapping
>>>> +             * but disallow block mapping. For example,
>>>> +             * map_entry_trampoline().
>>>> +             * So we have to increase CONT_PMD and PMD size here
>>>> +             * to avoid double counting.
>>>> +             */
>>>> +            if (pgprot_val(prot) & PTE_CONT)
>>>> +                dm_meminfo_add(addr, (next - addr), CONT_PMD);
>>>> +            else
>>>> +                dm_meminfo_add(addr, (next - addr), PMD);
>>> I don't understand the comment you're adding here. If somebody passes
>>> NO_BLOCK_MAPPINGS then that also prevents contiguous entries except at
>>> level 3.
>>>
>>> It also doesn't look you handle the error case properly when the mapping
>>> fails.
>>>
>>>> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>>>> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>>>>                       unsigned long end, bool free_mapped,
>>>>                       struct vmem_altmap *altmap)
>>>>   {
>>>> -    pte_t *ptep, pte;
>>>> +    pte_t pte;
>>>>         do {
>>>> -        ptep = pte_offset_kernel(pmdp, addr);
>>>>           pte = __ptep_get(ptep);
>>>>           if (pte_none(pte))
>>>>               continue;
>>>>             WARN_ON(!pte_present(pte));
>>>>           __pte_clear(&init_mm, addr, ptep);
>>>> +        dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>>>>           flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>>>           if (free_mapped)
>>>>               free_hotplug_page_range(pte_page(pte),
>>>>                           PAGE_SIZE, altmap);
>>> Is the existing code correct for contiguous entries here? I'd have
>>> thought that we'd need to make the range non-contiguous before knocking
>>> out the TLB.
>> The Arm ARM has this, which makes me think you are probably correct:
>>
>> IVNXYF:
>> The architecture does not require descriptors with the Contiguous bit set to 1
>> to be cached as a single TLB entry for the contiguous region. To avoid TLB
>> coherency issues, software is required to perform TLB maintenance on the entire
>> address region that results from using the Contiguous bit.
>>
>> I've asked for clarification internally. But I think we should hoist out the tlb
>> flush regardless because it will be faster if we just invalidate a single range.
>> I can handle that as a separate patch if you like.
> 
> Thanks, Ryan.

Of course it's not quite as simple as hoisting for the vmemmap unmapping case
since that is also freeing the memory, so we need to issue the tlbi before freeing.

vmemmap never uses contiguous mappings, so we could keep the current
strategy for that case and only hoist the tlbi-range for the linear map
unmapping case.

Or we could do a 2-phase approach for vmemmap where we first set VALID=0
for all entries, then flush the tlb, then walk again to clear the pte
and free the pointed-to page.

Other ideas welcome... I'll have a play.
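
Purely as an illustration of that 2-phase idea (the function name and
the exact structure below are a sketch, not the actual patch), it could
look roughly like this, reusing the existing helpers from mmu.c:

static void unmap_free_pte_range_2phase(pte_t *ptep, unsigned long addr,
					unsigned long end,
					struct vmem_altmap *altmap)
{
	unsigned long start = addr;
	pte_t *p;
	pte_t pte;

	/* Phase 1: clear the valid bit but keep the pfn in each entry. */
	for (p = ptep; addr != end; p++, addr += PAGE_SIZE) {
		pte = __ptep_get(p);
		if (pte_none(pte))
			continue;
		__set_pte(p, __pte(pte_val(pte) & ~PTE_VALID));
	}

	/* A single range invalidation once every entry is invalid. */
	flush_tlb_kernel_range(start, end);

	/* Phase 2: clear the entries and free the pages they pointed to. */
	for (addr = start, p = ptep; addr != end; p++, addr += PAGE_SIZE) {
		pte = __ptep_get(p);
		if (pte_none(pte))
			continue;
		__pte_clear(&init_mm, addr, p);
		free_hotplug_page_range(pte_page(pte), PAGE_SIZE, altmap);
	}
}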

> 
>>
>>
>> However, I think there may be another problem; IIUC, any old range of memory can
>> be hot-unplugged as long as it is section aligned. It doesn't have to be the
>> same range that was previously hot-plugged. But if the linear map is block
>> mapped, the range being unplugged may cover a partial block mapping.
>>
>> For example, with 4K pages, the section size is 128M, so you could hot unmap
>> 128M from a PUD leaf mapping (1G). What am I missing that means this doesn't go
>> bang?
>>
>> This would have been an issue for the non-rodata-full config so predates the
>> work to split the linear map dynamically. I'm not really sure how to solve this
>> for systems without BBML2 but without non--rodata-full.
>>
>> I must be misunderstanding something crucial here... I'll dig some more.
> 
> I'm not expert on memory hotplug. I'm not 100% confident my understanding is
> correct. But I noticed something that I misunderstood before, when I was testing
> the patch.
> 
> The hotunplug actually has two stages: offline and unplug (physically remove the
> device).
> 
> When we echo offline to the sysfs file, it actually just does offline. The
> offline just isolates the memory from buddy, but it does *NOT* unmap the memory
> from the linear mapping. The linear mapping will be unmapped at unplug stage. I
> tested the patch with QEMU, I just can emulate hotplug/hotunplug the whole dimm,
> for example, hotplug 1G, then hotunplug the same 1G. I can't emulate hotunplug
> in the smaller size. I thought it may be the limitation of QEMU in the first
> place. But I realized we can't hotunplug a part of dimm physically either,
> right? For example, we insert 1G dimm to the board, we can't take out 128M from
> the dimm physically. So IIUC the partial unmap of linear mapping should never
> happen.

Looking at the code, it looks to me like memory_hotplug.c doesn't care and will
try to unplug any span of memory that it is asked to, as long as start and end
are aligned to memory_block_size_bytes() (which for arm64 is section size = 128M
for 4K base pages).

But it looks like all the higher level users will only ever unplug in the same
granularity that was plugged in (I might be wrong but that's the sense I get).

arm64 adds the constraint that it won't unplug any memory that was present at
boot - see prevent_bootmem_remove_notifier().

So in practice this is probably safe, though perhaps brittle.

Some options:

 - leave it as is and worry about it if/when something shifts and hits the
   problem.

 - Enhance prevent_bootmem_remove_notifier() to reject unplugging memory blocks
   whose boundaries are within leaf mappings.

 - For non-bbml2_noabort systems, map hotplug memory with a new flag to ensure
   that leaf mappings are always <= memory_block_size_bytes(). For
   bbml2_noabort, split at the block boundaries before doing the unmapping.

Given I don't think this can happen in practice, probably the middle option is
the best? There is no runtime impact and it will give us a warning if it ever
does happen in future.
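
As a concrete illustration of the middle option, the check could be
something like the sketch below. Note that leaf_mapping_size() is a
hypothetical helper (it would have to walk the linear map and return
the size of the leaf entry covering a given address); it is not
existing API:

static bool range_splits_leaf_mapping(phys_addr_t start, phys_addr_t end)
{
	unsigned long vstart = (unsigned long)__va(start);
	unsigned long vend = (unsigned long)__va(end);

	/*
	 * A boundary is bad if it is not aligned to the size of the leaf
	 * entry (PAGE_SIZE, CONT_PTE_SIZE, PMD_SIZE, PUD_SIZE, ...) that
	 * maps it, i.e. it falls inside a larger block/contiguous mapping.
	 */
	return !IS_ALIGNED(vstart, leaf_mapping_size(vstart)) ||
	       !IS_ALIGNED(vend, leaf_mapping_size(vend - 1));
}

Wherever the check ends up living, it would then warn and reject the
removal when this returns true.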

What do you think?

Thanks,
Ryan

> 
> If I read the code correctly, the code does unmap the linear mapping on memory
> block granularity. The block linear mapping is unmapped when removing the first
> block if it covers multiple memory blocks. Then the page table will be none when
> removing the later blocks, but it is ok, the code just continue.
> 
> Thanks,
> Yang
> 
>>
>> Thanks,
>> Ryan
>>
>>
>>
>>> Will
> 

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 2 weeks, 1 day ago

On 1/22/26 6:43 AM, Ryan Roberts wrote:
> On 21/01/2026 22:44, Yang Shi wrote:
>> On 1/21/26 9:23 AM, Ryan Roberts wrote:
>>> On 13/01/2026 14:36, Will Deacon wrote:
>>>> On Tue, Jan 06, 2026 at 04:29:44PM -0800, Yang Shi wrote:
>>>>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>>>>> rodata=full"), the direct mapping may be split on some machines instead
>>>>> keeping static since boot. It makes more sense to show the direct mapping
>>>>> use in /proc/meminfo than before.
>>>>> This patch will make /proc/meminfo show the direct mapping use like the
>>>>> below (4K base page size):
>>>>> DirectMap4K:       94792 kB
>>>>> DirectMap64K:      134208 kB
>>>>> DirectMap2M:     1173504 kB
>>>>> DirectMap32M:     5636096 kB
>>>>> DirectMap1G:    529530880 kB
>>>>>
>>>>> Although just the machines which support BBML2_NOABORT can split the
>>>>> direct mapping, show it on all machines regardless of BBML2_NOABORT so
>>>>> that the users have consistent view in order to avoid confusion.
>>>>>
>>>>> Although ptdump also can tell the direct map use, but it needs to dump
>>>>> the whole kernel page table. It is costly and overkilling. It is also
>>>>> in debugfs which may not be enabled by all distros. So showing direct
>>>>> map use in /proc/meminfo seems more convenient and has less overhead.
>>>>>
>>>>> Signed-off-by: Yang Shi<yang@os.amperecomputing.com>
>>>>> ---
>>>>> v5: * Rebased to v6.19-rc4
>>>>>       * Fixed the build error for !CONFIG_PROC_FS
>>>>> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>>>>>       * Used shorter name for the helpers and variables per Ryan
>>>>>       * Fixed accounting for memory hotunplug
>>>>> v3: * Fixed the over-accounting problems per Ryan
>>>>>       * Introduced helpers for add/sub direct map use and #ifdef them with
>>>>>         CONFIG_PROC_FS per Ryan
>>>>>       * v3 is a fix patch on top of v2
>>>>> v2: * Counted in size instead of the number of entries per Ryan
>>>>>       * Removed shift array per Ryan
>>>>>       * Use lower case "k" per Ryan
>>>>>       * Fixed a couple of build warnings reported by kernel test robot
>>>>>       * Fixed a couple of poential miscounts
>>>>>
>>>>>    arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++++++-----
>>>>>    1 file changed, 181 insertions(+), 21 deletions(-)
>>>>>
>>>>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>>>>> index 8e1d80a7033e..422441c9a992 100644
>>>>> --- a/arch/arm64/mm/mmu.c
>>>>> +++ b/arch/arm64/mm/mmu.c
>>>>> @@ -29,6 +29,7 @@
>>>>>    #include <linux/mm_inline.h>
>>>>>    #include <linux/pagewalk.h>
>>>>>    #include <linux/stop_machine.h>
>>>>> +#include <linux/proc_fs.h>
>>>>>      #include <asm/barrier.h>
>>>>>    #include <asm/cputype.h>
>>>>> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>>>>>        dsb(ishst);
>>>>>    }
>>>>>    +enum dm_type {
>>>>> +    PTE,
>>>>> +    CONT_PTE,
>>>>> +    PMD,
>>>>> +    CONT_PMD,
>>>>> +    PUD,
>>>>> +    NR_DM_TYPE,
>>>>> +};
>>>>> +
>>>>> +#ifdef CONFIG_PROC_FS
>>>>> +static unsigned long dm_meminfo[NR_DM_TYPE];
>>>>> +
>>>>> +void arch_report_meminfo(struct seq_file *m)
>>>>> +{
>>>>> +    char *size[NR_DM_TYPE];
>>>> const?
>>>>
>>>>> +
>>>>> +#if defined(CONFIG_ARM64_4K_PAGES)
>>>>> +    size[PTE] = "4k";
>>>>> +    size[CONT_PTE] = "64k";
>>>>> +    size[PMD] = "2M";
>>>>> +    size[CONT_PMD] = "32M";
>>>>> +    size[PUD] = "1G";
>>>>> +#elif defined(CONFIG_ARM64_16K_PAGES)
>>>>> +    size[PTE] = "16k";
>>>>> +    size[CONT_PTE] = "2M";
>>>>> +    size[PMD] = "32M";
>>>>> +    size[CONT_PMD] = "1G";
>>>>> +#elif defined(CONFIG_ARM64_64K_PAGES)
>>>>> +    size[PTE] = "64k";
>>>>> +    size[CONT_PTE] = "2M";
>>>>> +    size[PMD] = "512M";
>>>>> +    size[CONT_PMD] = "16G";
>>>>> +#endif
>>>>> +
>>>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>>>> +            size[PTE], dm_meminfo[PTE] >> 10);
>>>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>>>> +            size[CONT_PTE],
>>>>> +            dm_meminfo[CONT_PTE] >> 10);
>>>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>>>> +            size[PMD], dm_meminfo[PMD] >> 10);
>>>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>>>> +            size[CONT_PMD],
>>>>> +            dm_meminfo[CONT_PMD] >> 10);
>>>>> +    if (pud_sect_supported())
>>>>> +        seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>>>> +            size[PUD], dm_meminfo[PUD] >> 10);
>>>> This seems a bit brittle to me. If somebody adds support for l1 block
>>>> mappings for !4k pages in future, they will forget to update this and
>>>> we'll end up returning kernel stack in /proc/meminfo afaict.
>>>>
>>>>> +static inline bool is_dm_addr(unsigned long addr)
>>>>> +{
>>>>> +    return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
>>>>> +}
>>>>> +
>>>>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>>>>> +                  enum dm_type type)
>>>>> +{
>>>>> +    if (is_dm_addr(addr))
>>>>> +        dm_meminfo[type] += size;
>>>>> +}
>>>>> +
>>>>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>>>>> +                  enum dm_type type)
>>>>> +{
>>>>> +    if (is_dm_addr(addr))
>>>>> +        dm_meminfo[type] -= size;
>>>>> +}
>>>>> +#else
>>>>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>>>>> +                  enum dm_type type)
>>>>> +{
>>>>> +}
>>>>> +
>>>>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>>>>> +                  enum dm_type type)
>>>>> +{
>>>>> +}
>>>>> +#endif
>>>>> +
>>>>>    static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>>>>>                 phys_addr_t phys, pgprot_t prot)
>>>>>    {
>>>>> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned
>>>>> long addr,
>>>>>              init_pte(ptep, addr, next, phys, __prot);
>>>>>    +        if (pgprot_val(__prot) & PTE_CONT)
>>>>> +            dm_meminfo_add(addr, (next - addr), CONT_PTE);
>>>>> +        else
>>>>> +            dm_meminfo_add(addr, (next - addr), PTE);
>>>>> +
>>>>>            ptep += pte_index(next) - pte_index(addr);
>>>>>            phys += next - addr;
>>>>>        } while (addr = next, addr != end);
>>>>> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr,
>>>>> unsigned long end,
>>>>>                (flags & NO_BLOCK_MAPPINGS) == 0) {
>>>>>                pmd_set_huge(pmdp, phys, prot);
>>>>>    +            /*
>>>>> +             * It is possible to have mappings allow cont mapping
>>>>> +             * but disallow block mapping. For example,
>>>>> +             * map_entry_trampoline().
>>>>> +             * So we have to increase CONT_PMD and PMD size here
>>>>> +             * to avoid double counting.
>>>>> +             */
>>>>> +            if (pgprot_val(prot) & PTE_CONT)
>>>>> +                dm_meminfo_add(addr, (next - addr), CONT_PMD);
>>>>> +            else
>>>>> +                dm_meminfo_add(addr, (next - addr), PMD);
>>>> I don't understand the comment you're adding here. If somebody passes
>>>> NO_BLOCK_MAPPINGS then that also prevents contiguous entries except at
>>>> level 3.
>>>>
>>>> It also doesn't look you handle the error case properly when the mapping
>>>> fails.
>>>>
>>>>> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>>>>> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>>>>>                        unsigned long end, bool free_mapped,
>>>>>                        struct vmem_altmap *altmap)
>>>>>    {
>>>>> -    pte_t *ptep, pte;
>>>>> +    pte_t pte;
>>>>>          do {
>>>>> -        ptep = pte_offset_kernel(pmdp, addr);
>>>>>            pte = __ptep_get(ptep);
>>>>>            if (pte_none(pte))
>>>>>                continue;
>>>>>              WARN_ON(!pte_present(pte));
>>>>>            __pte_clear(&init_mm, addr, ptep);
>>>>> +        dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>>>>>            flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>>>>            if (free_mapped)
>>>>>                free_hotplug_page_range(pte_page(pte),
>>>>>                            PAGE_SIZE, altmap);
>>>> Is the existing code correct for contiguous entries here? I'd have
>>>> thought that we'd need to make the range non-contiguous before knocking
>>>> out the TLB.
>>> The Arm ARM has this, which makes me think you are probably correct:
>>>
>>> IVNXYF:
>>> The architecture does not require descriptors with the Contiguous bit set to 1
>>> to be cached as a single TLB entry for the contiguous region. To avoid TLB
>>> coherency issues, software is required to perform TLB maintenance on the entire
>>> address region that results from using the Contiguous bit.
>>>
>>> I've asked for clarification internally. But I think we should hoist out the tlb
>>> flush regardless because it will be faster if we just invalidate a single range.
>>> I can handle that as a separate patch if you like.
>> Thanks, Ryan.
> Of course it's not quite as simple as hoisting for the vmemmap unmapping case
> since that is also freeing the memory, so we need to issue the tlbi before freeing.
>
> vmemmap never uses contiguous mappings so could continue with the current
> strategy for that and only hoist the tlbi-range for the unmapping the linear map
> case.
>
> Or could do a 2 phase approach for vmemmap where we first set VALID=0 for all
> entries, then flush tlb, then walk again, to free clear the pte and free the
> pointed to page.
>
> Other ideas welcome... I'll have a play.
>
>>> However, I think there may be another problem; IIUC, any old range of memory can
>>> be hot-unplugged as long as it is section aligned. It doesn't have to be the
>>> same range that was previously hot-plugged. But if the linear map is block
>>> mapped, the range being unplugged may cover a partial block mapping.
>>>
>>> For example, with 4K pages, the section size is 128M, so you could hot unmap
>>> 128M from a PUD leaf mapping (1G). What am I missing that means this doesn't go
>>> bang?
>>>
>>> This would have been an issue for the non-rodata-full config so predates the
>>> work to split the linear map dynamically. I'm not really sure how to solve this
>>> for systems without BBML2 but without non--rodata-full.
>>>
>>> I must be misunderstanding something crucial here... I'll dig some more.
>> I'm not expert on memory hotplug. I'm not 100% confident my understanding is
>> correct. But I noticed something that I misunderstood before, when I was testing
>> the patch.
>>
>> The hotunplug actually has two stages: offline and unplug (physically remove the
>> device).
>>
>> When we echo offline to the sysfs file, it actually just does offline. The
>> offline just isolates the memory from buddy, but it does *NOT* unmap the memory
>> from the linear mapping. The linear mapping will be unmapped at unplug stage. I
>> tested the patch with QEMU, I just can emulate hotplug/hotunplug the whole dimm,
>> for example, hotplug 1G, then hotunplug the same 1G. I can't emulate hotunplug
>> in the smaller size. I thought it may be the limitation of QEMU in the first
>> place. But I realized we can't hotunplug a part of dimm physically either,
>> right? For example, we insert 1G dimm to the board, we can't take out 128M from
>> the dimm physically. So IIUC the partial unmap of linear mapping should never
>> happen.
> Looking at the code, it looks to me like memory_hotplug.c doesn't care and will
> try to unplug any span of memory that it is asked to, as long as start and end
> are aligned to memory_block_size_bytes() (which for arm64 is section size = 128M
> for 4K base pages).
>
> But it looks like all the higher level users will only ever unplug in the same
> granularity that was plugged in (I might be wrong but that's the sense I get).
>
> arm64 adds the constraint that it won't unplug any memory that was present at
> boot - see prevent_bootmem_remove_notifier().
>
> So in practice this is probably safe, though perhaps brittle.
>
> Some options:
>
>   - leave it as is and worry about it if/when something shifts and hits the
>     problem.

Seems like the simplest way :-)

>   - Enhance prevent_bootmem_remove_notifier() to reject unplugging memory blocks
>     whose boundaries are within leaf mappings.

I don't quite get why we should enhance
prevent_bootmem_remove_notifier(). If I read the code correctly, it
simply rejects offlining boot memory; offlining a single memory block
is fine. If you check the boundaries there, will it prevent offlining
a single memory block?

I think you would need to enhance try_remove_memory() instead. But the
kernel may unmap the linear mapping per memory block if an altmap is
used, so IIUC you would need an extra page table walk, using the start
and size of the unplugged DIMM, before removing the memory to tell
whether the boundaries fall within leaf mappings. Can it be done in
arch_remove_memory()? It seems not, because arch_remove_memory() may be
called at memory block granularity if an altmap is used.

>   - For non-bbml2_noabort systems, map hotplug memory with a new flag to ensure
>     that leaf mappings are always <= memory_block_size_bytes(). For
>     bbml2_noabort, split at the block boundaries before doing the unmapping.

The linear mapping would then be at most 128M (with 4K page size),
which sounds suboptimal IMHO.

> Given I don't think this can happen in practice, probably the middle option is
> the best? There is no runtime impact and it will give us a warning if it ever
> does happen in future.
>
> What do you think?

I agree it can't happen in practice, so why not just take option #1 
given the complexity added by option #2?

Thanks,
Yang

> Thanks,
> Ryan
>
>> If I read the code correctly, the code does unmap the linear mapping on memory
>> block granularity. The block linear mapping is unmapped when removing the first
>> block if it covers multiple memory blocks. Then the page table will be none when
>> removing the later blocks, but it is ok, the code just continue.
>>
>> Thanks,
>> Yang
>>
>>> Thanks,
>>> Ryan
>>>
>>>
>>>
>>>> Will

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Will Deacon 1 week, 4 days ago
On Thu, Jan 22, 2026 at 01:59:54PM -0800, Yang Shi wrote:
> On 1/22/26 6:43 AM, Ryan Roberts wrote:
> > On 21/01/2026 22:44, Yang Shi wrote:
> > > On 1/21/26 9:23 AM, Ryan Roberts wrote:
> > But it looks like all the higher level users will only ever unplug in the same
> > granularity that was plugged in (I might be wrong but that's the sense I get).
> > 
> > arm64 adds the constraint that it won't unplug any memory that was present at
> > boot - see prevent_bootmem_remove_notifier().
> > 
> > So in practice this is probably safe, though perhaps brittle.
> > 
> > Some options:
> > 
> >   - leave it as is and worry about it if/when something shifts and hits the
> >     problem.
> 
> Seems like the most simple way :-)
> 
> >   - Enhance prevent_bootmem_remove_notifier() to reject unplugging memory blocks
> >     whose boundaries are within leaf mappings.
> 
> I don't quite get why we should enhance prevent_bootmem_remove_notifier().
> If I read the code correctly, it just simply reject offline boot memory.
> Offlining a single memory block is fine. If you check the boundaries there,
> will it prevent from offlining a single memory block?
> 
> I think you need enhance try_remove_memory(). But kernel may unmap linear
> mapping by memory blocks if altmap is used. So you should need an extra page
> table walk with the start and the size of unplugged dimm before removing the
> memory to tell whether the boundaries are within leaf mappings or not IIUC.
> Can it be done in arch_remove_memory()? It seems not because
> arch_remove_memory() may be called on memory block granularity if altmap is
> used.
> 
> >   - For non-bbml2_noabort systems, map hotplug memory with a new flag to ensure
> >     that leaf mappings are always <= memory_block_size_bytes(). For
> >     bbml2_noabort, split at the block boundaries before doing the unmapping.
> 
> The linear mapping will be at most 128M (4K page size), it sounds sub
> optimal IMHO.
> 
> > Given I don't think this can happen in practice, probably the middle option is
> > the best? There is no runtime impact and it will give us a warning if it ever
> > does happen in future.
> > 
> > What do you think?
> 
> I agree it can't happen in practice, so why not just take option #1 given
> the complexity added by option #2?

It still looks broken in the case that a region that was mapped with the
contiguous bit is then unmapped. The sequence seems to iterate over
each contiguous PTE, zapping the entry and doing the TLBI while the
other entries in the contiguous range remain intact. I don't think
that's sufficient to guarantee that you don't have stale TLB entries
once you've finished processing the whole range.

For example, imagine you have an L1 TLB that only supports 4k entries
and an L2 TLB that supports 64k entries. Let's say that the contiguous
range is mapped by pte0 ... pte15 and we've zapped and invalidated
pte0 ... pte14. At that point, I think the hardware is permitted to use
the last remaining contiguous pte (pte15) to allocate a 64k entry in the
L2 TLB covering the whole range. A (speculative) walk via one of the
virtual addresses translated by pte0 ... pte14 could then hit that entry
and fill a 4k entry into the L1 TLB. So at the end of the sequence, you
could presumably still access the first 60k of the range thanks to stale
entries in the L1 TLB?

So it looks broken to me. What do you think? If you agree, then let's
fix this problem first before adding the new /proc/meminfo stuff.

Will
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 1 week, 4 days ago

On 1/26/26 6:14 AM, Will Deacon wrote:
> On Thu, Jan 22, 2026 at 01:59:54PM -0800, Yang Shi wrote:
>> On 1/22/26 6:43 AM, Ryan Roberts wrote:
>>> On 21/01/2026 22:44, Yang Shi wrote:
>>>> On 1/21/26 9:23 AM, Ryan Roberts wrote:
>>> But it looks like all the higher level users will only ever unplug in the same
>>> granularity that was plugged in (I might be wrong but that's the sense I get).
>>>
>>> arm64 adds the constraint that it won't unplug any memory that was present at
>>> boot - see prevent_bootmem_remove_notifier().
>>>
>>> So in practice this is probably safe, though perhaps brittle.
>>>
>>> Some options:
>>>
>>>    - leave it as is and worry about it if/when something shifts and hits the
>>>      problem.
>> Seems like the most simple way :-)
>>
>>>    - Enhance prevent_bootmem_remove_notifier() to reject unplugging memory blocks
>>>      whose boundaries are within leaf mappings.
>> I don't quite get why we should enhance prevent_bootmem_remove_notifier().
>> If I read the code correctly, it just simply reject offline boot memory.
>> Offlining a single memory block is fine. If you check the boundaries there,
>> will it prevent from offlining a single memory block?
>>
>> I think you need enhance try_remove_memory(). But kernel may unmap linear
>> mapping by memory blocks if altmap is used. So you should need an extra page
>> table walk with the start and the size of unplugged dimm before removing the
>> memory to tell whether the boundaries are within leaf mappings or not IIUC.
>> Can it be done in arch_remove_memory()? It seems not because
>> arch_remove_memory() may be called on memory block granularity if altmap is
>> used.
>>
>>>    - For non-bbml2_noabort systems, map hotplug memory with a new flag to ensure
>>>      that leaf mappings are always <= memory_block_size_bytes(). For
>>>      bbml2_noabort, split at the block boundaries before doing the unmapping.
>> The linear mapping will be at most 128M (4K page size), it sounds sub
>> optimal IMHO.
>>
>>> Given I don't think this can happen in practice, probably the middle option is
>>> the best? There is no runtime impact and it will give us a warning if it ever
>>> does happen in future.
>>>
>>> What do you think?
>> I agree it can't happen in practice, so why not just take option #1 given
>> the complexity added by option #2?
> It still looks broken in the case that a region that was mapped with the
> contiguous bit is then unmapped. The sequence seems to iterate over
> each contiguous PTE, zapping the entry and doing the TLBI while the
> other entries in the contiguous range remain intact. I don't think
> that's sufficient to guarantee that you don't have stale TLB entries
> once you've finished processing the whole range.
>
> For example, imagine you have an L1 TLB that only supports 4k entries
> and an L2 TLB that supports 64k entries. Let's say that the contiguous
> range is mapped by pte0 ... pte15 and we've zapped and invalidated
> pte0 ... pte14. At that point, I think the hardware is permitted to use
> the last remaining contiguous pte (pte15) to allocate a 64k entry in the
> L2 TLB covering the whole range. A (speculative) walk via one of the
> virtual addresses translated by pte0 ... pte14 could then hit that entry
> and fill a 4k entry into the L1 TLB. So at the end of the sequence, you
> could presumably still access the first 60k of the range thanks to stale
> entries in the L1 TLB?

It is a little bit hard for me to understand how a (speculative) walk
could happen by the time we reach here.

Before we reach here, IIUC the kernel has:

  * offlined all the page blocks. This means they are freed and isolated
from the buddy allocator, so even a pfn walk (for example, compaction)
should not reach them at all.
  * eliminated the vmemmap, so no struct page is available.

From the kernel's point of view, they are unreachable now. Did I miss
and/or misunderstand something?

Thanks,
Yang

>
> So it looks broken to me. What do you think? If you agree, then let's
> fix this problem first before adding the new /proc/meminfo stuff.
>
> Will

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Will Deacon 1 week, 4 days ago
On Mon, Jan 26, 2026 at 09:55:06AM -0800, Yang Shi wrote:
> 
> 
> On 1/26/26 6:14 AM, Will Deacon wrote:
> > On Thu, Jan 22, 2026 at 01:59:54PM -0800, Yang Shi wrote:
> > > On 1/22/26 6:43 AM, Ryan Roberts wrote:
> > > > On 21/01/2026 22:44, Yang Shi wrote:
> > > > > On 1/21/26 9:23 AM, Ryan Roberts wrote:
> > > > But it looks like all the higher level users will only ever unplug in the same
> > > > granularity that was plugged in (I might be wrong but that's the sense I get).
> > > > 
> > > > arm64 adds the constraint that it won't unplug any memory that was present at
> > > > boot - see prevent_bootmem_remove_notifier().
> > > > 
> > > > So in practice this is probably safe, though perhaps brittle.
> > > > 
> > > > Some options:
> > > > 
> > > >    - leave it as is and worry about it if/when something shifts and hits the
> > > >      problem.
> > > Seems like the most simple way :-)
> > > 
> > > >    - Enhance prevent_bootmem_remove_notifier() to reject unplugging memory blocks
> > > >      whose boundaries are within leaf mappings.
> > > I don't quite get why we should enhance prevent_bootmem_remove_notifier().
> > > If I read the code correctly, it just simply reject offline boot memory.
> > > Offlining a single memory block is fine. If you check the boundaries there,
> > > will it prevent from offlining a single memory block?
> > > 
> > > I think you need enhance try_remove_memory(). But kernel may unmap linear
> > > mapping by memory blocks if altmap is used. So you should need an extra page
> > > table walk with the start and the size of unplugged dimm before removing the
> > > memory to tell whether the boundaries are within leaf mappings or not IIUC.
> > > Can it be done in arch_remove_memory()? It seems not because
> > > arch_remove_memory() may be called on memory block granularity if altmap is
> > > used.
> > > 
> > > >    - For non-bbml2_noabort systems, map hotplug memory with a new flag to ensure
> > > >      that leaf mappings are always <= memory_block_size_bytes(). For
> > > >      bbml2_noabort, split at the block boundaries before doing the unmapping.
> > > The linear mapping will be at most 128M (4K page size), it sounds sub
> > > optimal IMHO.
> > > 
> > > > Given I don't think this can happen in practice, probably the middle option is
> > > > the best? There is no runtime impact and it will give us a warning if it ever
> > > > does happen in future.
> > > > 
> > > > What do you think?
> > > I agree it can't happen in practice, so why not just take option #1 given
> > > the complexity added by option #2?
> > It still looks broken in the case that a region that was mapped with the
> > contiguous bit is then unmapped. The sequence seems to iterate over
> > each contiguous PTE, zapping the entry and doing the TLBI while the
> > other entries in the contiguous range remain intact. I don't think
> > that's sufficient to guarantee that you don't have stale TLB entries
> > once you've finished processing the whole range.
> > 
> > For example, imagine you have an L1 TLB that only supports 4k entries
> > and an L2 TLB that supports 64k entries. Let's say that the contiguous
> > range is mapped by pte0 ... pte15 and we've zapped and invalidated
> > pte0 ... pte14. At that point, I think the hardware is permitted to use
> > the last remaining contiguous pte (pte15) to allocate a 64k entry in the
> > L2 TLB covering the whole range. A (speculative) walk via one of the
> > virtual addresses translated by pte0 ... pte14 could then hit that entry
> > and fill a 4k entry into the L1 TLB. So at the end of the sequence, you
> > could presumably still access the first 60k of the range thanks to stale
> > entries in the L1 TLB?
> 
> It is a little bit hard for me to understand how come a (speculative) walk
> could happen when we reach here.
> 
> Before we reach here, IIUC kernel has:
> 
>  * offlined all the page blocks. It means they are freed and isolated from
> buddy allocator, even pfn walk (for example, compaction) should not reach
> them at all.
>  * vmemmap has been eliminated. So no struct page available.
> 
> From kernel point of view, they are nonreachable now. Did I miss and/or
> misunderstand something?

I'm talking about hardware speculation. It's mapped as normal memory so
the CPU can speculate from it. We can't really reason about the bounds
of that, especially in a world with branch predictors and history-based
prefetchers.

Will
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 1 week, 4 days ago

On 1/26/26 10:58 AM, Will Deacon wrote:
> On Mon, Jan 26, 2026 at 09:55:06AM -0800, Yang Shi wrote:
>>
>> On 1/26/26 6:14 AM, Will Deacon wrote:
>>> On Thu, Jan 22, 2026 at 01:59:54PM -0800, Yang Shi wrote:
>>>> On 1/22/26 6:43 AM, Ryan Roberts wrote:
>>>>> On 21/01/2026 22:44, Yang Shi wrote:
>>>>>> On 1/21/26 9:23 AM, Ryan Roberts wrote:
>>>>> But it looks like all the higher level users will only ever unplug in the same
>>>>> granularity that was plugged in (I might be wrong but that's the sense I get).
>>>>>
>>>>> arm64 adds the constraint that it won't unplug any memory that was present at
>>>>> boot - see prevent_bootmem_remove_notifier().
>>>>>
>>>>> So in practice this is probably safe, though perhaps brittle.
>>>>>
>>>>> Some options:
>>>>>
>>>>>     - leave it as is and worry about it if/when something shifts and hits the
>>>>>       problem.
>>>> Seems like the most simple way :-)
>>>>
>>>>>     - Enhance prevent_bootmem_remove_notifier() to reject unplugging memory blocks
>>>>>       whose boundaries are within leaf mappings.
>>>> I don't quite get why we should enhance prevent_bootmem_remove_notifier().
>>>> If I read the code correctly, it just simply reject offline boot memory.
>>>> Offlining a single memory block is fine. If you check the boundaries there,
>>>> will it prevent from offlining a single memory block?
>>>>
>>>> I think you need enhance try_remove_memory(). But kernel may unmap linear
>>>> mapping by memory blocks if altmap is used. So you should need an extra page
>>>> table walk with the start and the size of unplugged dimm before removing the
>>>> memory to tell whether the boundaries are within leaf mappings or not IIUC.
>>>> Can it be done in arch_remove_memory()? It seems not because
>>>> arch_remove_memory() may be called on memory block granularity if altmap is
>>>> used.
>>>>
>>>>>     - For non-bbml2_noabort systems, map hotplug memory with a new flag to ensure
>>>>>       that leaf mappings are always <= memory_block_size_bytes(). For
>>>>>       bbml2_noabort, split at the block boundaries before doing the unmapping.
>>>> The linear mapping will be at most 128M (4K page size), it sounds sub
>>>> optimal IMHO.
>>>>
>>>>> Given I don't think this can happen in practice, probably the middle option is
>>>>> the best? There is no runtime impact and it will give us a warning if it ever
>>>>> does happen in future.
>>>>>
>>>>> What do you think?
>>>> I agree it can't happen in practice, so why not just take option #1 given
>>>> the complexity added by option #2?
>>> It still looks broken in the case that a region that was mapped with the
>>> contiguous bit is then unmapped. The sequence seems to iterate over
>>> each contiguous PTE, zapping the entry and doing the TLBI while the
>>> other entries in the contiguous range remain intact. I don't think
>>> that's sufficient to guarantee that you don't have stale TLB entries
>>> once you've finished processing the whole range.
>>>
>>> For example, imagine you have an L1 TLB that only supports 4k entries
>>> and an L2 TLB that supports 64k entries. Let's say that the contiguous
>>> range is mapped by pte0 ... pte15 and we've zapped and invalidated
>>> pte0 ... pte14. At that point, I think the hardware is permitted to use
>>> the last remaining contiguous pte (pte15) to allocate a 64k entry in the
>>> L2 TLB covering the whole range. A (speculative) walk via one of the
>>> virtual addresses translated by pte0 ... pte14 could then hit that entry
>>> and fill a 4k entry into the L1 TLB. So at the end of the sequence, you
>>> could presumably still access the first 60k of the range thanks to stale
>>> entries in the L1 TLB?
>> It is a little bit hard for me to understand how come a (speculative) walk
>> could happen when we reach here.
>>
>> Before we reach here, IIUC kernel has:
>>
>>   * offlined all the page blocks. It means they are freed and isolated from
>> buddy allocator, even pfn walk (for example, compaction) should not reach
>> them at all.
>>   * vmemmap has been eliminated. So no struct page available.
>>
>>  From kernel point of view, they are nonreachable now. Did I miss and/or
>> misunderstand something?
> I'm talking about hardware speculation. It's mapped as normal memory so
> the CPU can speculate from it. We can't really reason about the bounds
> of that, especially in a world with branch predictors and history-based
> prefetchers.

OK. If that can happen, I think the suggestions from you and Ryan
should work IIUC:

Clear all the entries in the contiguous range, then invalidate the TLB
for the whole range.

I can come up with a patch, or would Ryan like to take it?

Thanks,
Yang

>
> Will

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Ryan Roberts 1 week, 4 days ago
On 26/01/2026 20:50, Yang Shi wrote:
> 
> 
> On 1/26/26 10:58 AM, Will Deacon wrote:
>> On Mon, Jan 26, 2026 at 09:55:06AM -0800, Yang Shi wrote:
>>>
>>> On 1/26/26 6:14 AM, Will Deacon wrote:
>>>> On Thu, Jan 22, 2026 at 01:59:54PM -0800, Yang Shi wrote:
>>>>> On 1/22/26 6:43 AM, Ryan Roberts wrote:
>>>>>> On 21/01/2026 22:44, Yang Shi wrote:
>>>>>>> On 1/21/26 9:23 AM, Ryan Roberts wrote:
>>>>>> But it looks like all the higher level users will only ever unplug in the
>>>>>> same
>>>>>> granularity that was plugged in (I might be wrong but that's the sense I
>>>>>> get).
>>>>>>
>>>>>> arm64 adds the constraint that it won't unplug any memory that was present at
>>>>>> boot - see prevent_bootmem_remove_notifier().
>>>>>>
>>>>>> So in practice this is probably safe, though perhaps brittle.
>>>>>>
>>>>>> Some options:
>>>>>>
>>>>>>     - leave it as is and worry about it if/when something shifts and hits the
>>>>>>       problem.
>>>>> Seems like the most simple way :-)
>>>>>
>>>>>>     - Enhance prevent_bootmem_remove_notifier() to reject unplugging
>>>>>> memory blocks
>>>>>>       whose boundaries are within leaf mappings.
>>>>> I don't quite get why we should enhance prevent_bootmem_remove_notifier().
>>>>> If I read the code correctly, it just simply reject offline boot memory.
>>>>> Offlining a single memory block is fine. If you check the boundaries there,
>>>>> will it prevent from offlining a single memory block?
>>>>>
>>>>> I think you need enhance try_remove_memory(). But kernel may unmap linear
>>>>> mapping by memory blocks if altmap is used. So you should need an extra page
>>>>> table walk with the start and the size of unplugged dimm before removing the
>>>>> memory to tell whether the boundaries are within leaf mappings or not IIUC.
>>>>> Can it be done in arch_remove_memory()? It seems not because
>>>>> arch_remove_memory() may be called on memory block granularity if altmap is
>>>>> used.
>>>>>
>>>>>>     - For non-bbml2_noabort systems, map hotplug memory with a new flag to
>>>>>> ensure
>>>>>>       that leaf mappings are always <= memory_block_size_bytes(). For
>>>>>>       bbml2_noabort, split at the block boundaries before doing the
>>>>>> unmapping.
>>>>> The linear mapping will be at most 128M (4K page size), it sounds sub
>>>>> optimal IMHO.
>>>>>
>>>>>> Given I don't think this can happen in practice, probably the middle
>>>>>> option is
>>>>>> the best? There is no runtime impact and it will give us a warning if it ever
>>>>>> does happen in future.
>>>>>>
>>>>>> What do you think?
>>>>> I agree it can't happen in practice, so why not just take option #1 given
>>>>> the complexity added by option #2?
>>>> It still looks broken in the case that a region that was mapped with the
>>>> contiguous bit is then unmapped. The sequence seems to iterate over
>>>> each contiguous PTE, zapping the entry and doing the TLBI while the
>>>> other entries in the contiguous range remain intact. I don't think
>>>> that's sufficient to guarantee that you don't have stale TLB entries
>>>> once you've finished processing the whole range.
>>>>
>>>> For example, imagine you have an L1 TLB that only supports 4k entries
>>>> and an L2 TLB that supports 64k entries. Let's say that the contiguous
>>>> range is mapped by pte0 ... pte15 and we've zapped and invalidated
>>>> pte0 ... pte14. At that point, I think the hardware is permitted to use
>>>> the last remaining contiguous pte (pte15) to allocate a 64k entry in the
>>>> L2 TLB covering the whole range. A (speculative) walk via one of the
>>>> virtual addresses translated by pte0 ... pte14 could then hit that entry
>>>> and fill a 4k entry into the L1 TLB. So at the end of the sequence, you
>>>> could presumably still access the first 60k of the range thanks to stale
>>>> entries in the L1 TLB?
>>> It is a little bit hard for me to understand how come a (speculative) walk
>>> could happen when we reach here.
>>>
>>> Before we reach here, IIUC kernel has:
>>>
>>>   * offlined all the page blocks. It means they are freed and isolated from
>>> buddy allocator, even pfn walk (for example, compaction) should not reach
>>> them at all.
>>>   * vmemmap has been eliminated. So no struct page available.
>>>
>>>  From kernel point of view, they are nonreachable now. Did I miss and/or
>>> misunderstand something?
>> I'm talking about hardware speculation. It's mapped as normal memory so
>> the CPU can speculate from it. We can't really reason about the bounds
>> of that, especially in a world with branch predictors and history-based
>> prefetchers.
> 
> OK. If it could happen, I think the suggestions from you and Ryan should work IIUC:
> 
> Clear all the entries in the cont range, then invalidate TLB for the whole range.
> 
> I can come up with a patch or Ryan would like to take it?

Hi,

There are 2 separate issues that have been raised here and I think we are
conflating them a bit...


1: The contiguous range teardown + tlbi issue that Will raised. That is
definitely a problem and needs to be fixed. (though I think prior to the BBML2
dynamic linear block mapping support it would be rare in practice; probably it
would only affect cont-pmd mappings for 16K and 64K base page configs. With
BBML2 dynamic linear block mapping support, this can happen for contiguous
mappings at all levels with all base page sizes).

I roughed out a patch to hoist out the tlbis and issue them as a single range
invalidation after clearing all the pgtable entries. I think this will be MUCH
faster and will solve the contiguous issue too. The one catch is that this only
works for the linear map, and the same helpers are used for the vmemmap. For
the latter we also free the memory, so the tlbis need to happen before the
freeing. But vmemmap doesn't use contiguous mappings, so I've added a warning
checking that and used a different scheme based on whether we are freeing or
not.
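
To make that concrete, the reworked PTE-level helper could look roughly
like the sketch below (the restructuring is only a sketch based on the
description above, not the patch Anshuman will post):

static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
				    unsigned long end, bool free_mapped,
				    struct vmem_altmap *altmap)
{
	unsigned long start = addr;
	pte_t pte;

	do {
		pte = __ptep_get(ptep);
		if (pte_none(pte))
			continue;

		WARN_ON(!pte_present(pte));
		/* vmemmap (free_mapped) never uses contiguous mappings, so
		 * the per-page flush-then-free path below stays safe. */
		WARN_ON(free_mapped && pte_cont(pte));
		__pte_clear(&init_mm, addr, ptep);
		if (free_mapped) {
			flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
			free_hotplug_page_range(pte_page(pte), PAGE_SIZE,
						altmap);
		}
	} while (ptep++, addr += PAGE_SIZE, addr != end);

	/* Linear map: a single range invalidation after all the entries are
	 * cleared, which also covers any contiguous mappings in the range. */
	if (!free_mapped)
		flush_tlb_kernel_range(start, end);
}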

Anshuman has kindly agreed to knock the patch into shape and do the testing.
Hopefully he can post shortly.


2: hot-unplugging a range that starts or terminates in the middle of a large
leaf mapping. The low level hot-unplug implementation allows unplugging any
range of memory as long as it is section size aligned (128M). So theoretically
you could have a 1G PUD leaf mapping and try to unplug 128M from the middle of
it. In practice this doesn't happen because all the users of the hot-unplug code
group memory into devices. If you add a range, you can only remove that same
range. When adding, we will guarantee that the leaf mappings exactly map the
range, so the same guarantee can be given for hot-remove.

BUT, that feels fragile to me. I'd like to add a check in
prevent_bootmem_remove_notifier() to ensure that the proposed unplug range is
exactly covered by leaf mappings, and if it isn't, warn and reject. This will
allow us to fail safe for a tiny amount of overhead (which will be made up for
many, many times over by hoisting the tlbis and batching the barriers in 1).

Anshuman has also kindly agreed to put a patch together for that.


Thanks,
Ryan


> 
> Thanks,
> Yang
> 
>>
>> Will
> 

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 1 week, 3 days ago

On 1/27/26 12:57 AM, Ryan Roberts wrote:
> On 26/01/2026 20:50, Yang Shi wrote:
>>
>> On 1/26/26 10:58 AM, Will Deacon wrote:
>>> On Mon, Jan 26, 2026 at 09:55:06AM -0800, Yang Shi wrote:
>>>> On 1/26/26 6:14 AM, Will Deacon wrote:
>>>>> On Thu, Jan 22, 2026 at 01:59:54PM -0800, Yang Shi wrote:
>>>>>> On 1/22/26 6:43 AM, Ryan Roberts wrote:
>>>>>>> On 21/01/2026 22:44, Yang Shi wrote:
>>>>>>>> On 1/21/26 9:23 AM, Ryan Roberts wrote:
>>>>>>> But it looks like all the higher level users will only ever unplug in the
>>>>>>> same
>>>>>>> granularity that was plugged in (I might be wrong but that's the sense I
>>>>>>> get).
>>>>>>>
>>>>>>> arm64 adds the constraint that it won't unplug any memory that was present at
>>>>>>> boot - see prevent_bootmem_remove_notifier().
>>>>>>>
>>>>>>> So in practice this is probably safe, though perhaps brittle.
>>>>>>>
>>>>>>> Some options:
>>>>>>>
>>>>>>>      - leave it as is and worry about it if/when something shifts and hits the
>>>>>>>        problem.
>>>>>> Seems like the most simple way :-)
>>>>>>
>>>>>>>      - Enhance prevent_bootmem_remove_notifier() to reject unplugging
>>>>>>> memory blocks
>>>>>>>        whose boundaries are within leaf mappings.
>>>>>> I don't quite get why we should enhance prevent_bootmem_remove_notifier().
>>>>>> If I read the code correctly, it just simply reject offline boot memory.
>>>>>> Offlining a single memory block is fine. If you check the boundaries there,
>>>>>> will it prevent from offlining a single memory block?
>>>>>>
>>>>>> I think you need enhance try_remove_memory(). But kernel may unmap linear
>>>>>> mapping by memory blocks if altmap is used. So you should need an extra page
>>>>>> table walk with the start and the size of unplugged dimm before removing the
>>>>>> memory to tell whether the boundaries are within leaf mappings or not IIUC.
>>>>>> Can it be done in arch_remove_memory()? It seems not because
>>>>>> arch_remove_memory() may be called on memory block granularity if altmap is
>>>>>> used.
>>>>>>
>>>>>>>      - For non-bbml2_noabort systems, map hotplug memory with a new flag to
>>>>>>> ensure
>>>>>>>        that leaf mappings are always <= memory_block_size_bytes(). For
>>>>>>>        bbml2_noabort, split at the block boundaries before doing the
>>>>>>> unmapping.
>>>>>> The linear mapping will be at most 128M (4K page size), it sounds sub
>>>>>> optimal IMHO.
>>>>>>
>>>>>>> Given I don't think this can happen in practice, probably the middle
>>>>>>> option is
>>>>>>> the best? There is no runtime impact and it will give us a warning if it ever
>>>>>>> does happen in future.
>>>>>>>
>>>>>>> What do you think?
>>>>>> I agree it can't happen in practice, so why not just take option #1 given
>>>>>> the complexity added by option #2?
>>>>> It still looks broken in the case that a region that was mapped with the
>>>>> contiguous bit is then unmapped. The sequence seems to iterate over
>>>>> each contiguous PTE, zapping the entry and doing the TLBI while the
>>>>> other entries in the contiguous range remain intact. I don't think
>>>>> that's sufficient to guarantee that you don't have stale TLB entries
>>>>> once you've finished processing the whole range.
>>>>>
>>>>> For example, imagine you have an L1 TLB that only supports 4k entries
>>>>> and an L2 TLB that supports 64k entries. Let's say that the contiguous
>>>>> range is mapped by pte0 ... pte15 and we've zapped and invalidated
>>>>> pte0 ... pte14. At that point, I think the hardware is permitted to use
>>>>> the last remaining contiguous pte (pte15) to allocate a 64k entry in the
>>>>> L2 TLB covering the whole range. A (speculative) walk via one of the
>>>>> virtual addresses translated by pte0 ... pte14 could then hit that entry
>>>>> and fill a 4k entry into the L1 TLB. So at the end of the sequence, you
>>>>> could presumably still access the first 60k of the range thanks to stale
>>>>> entries in the L1 TLB?
>>>> It is a little bit hard for me to understand how come a (speculative) walk
>>>> could happen when we reach here.
>>>>
>>>> Before we reach here, IIUC kernel has:
>>>>
>>>>    * offlined all the page blocks. It means they are freed and isolated from
>>>> buddy allocator, even pfn walk (for example, compaction) should not reach
>>>> them at all.
>>>>    * vmemmap has been eliminated. So no struct page available.
>>>>
>>>>   From kernel point of view, they are nonreachable now. Did I miss and/or
>>>> misunderstand something?
>>> I'm talking about hardware speculation. It's mapped as normal memory so
>>> the CPU can speculate from it. We can't really reason about the bounds
>>> of that, especially in a world with branch predictors and history-based
>>> prefetchers.
>> OK. If it could happen, I think the suggestions from you and Ryan should work IIUC:
>>
>> Clear all the entries in the cont range, then invalidate TLB for the whole range.
>>
>> I can come up with a patch or Ryan would like to take it?
> Hi,
>
> There are 2 separate issues that have been raised here and I think we are
> conflating them a bit...
>
>
> 1: The contiguous range teardown + tlbi issue that Will raised. That is
> definitely a problem and needs to be fixed. (though I think prior to the BBML2
> dynamic linear block mapping support it would be rare in practice; probably it
> would only affect cont-pmd mappings for 16K and 64K base page configs. With
> BBML2 dynamic linear block mapping support, this can happen for contiguous
> mappings at all levels with all base page sizes).
>
> I roughed out a patch to hoist out the tlbis and issue as a single range after
> clearing all the pgtable entries. I think this will be MUCH faster and will
> solve the contiguous issue too. The one catch is that this only works for linear
> map and the same helpers are used for the vmemmap. For the latter we also free
> the memory, so the tlbis need to happen before the freeing. But vmemmap doesn't
> use contiguous mappings so I've added a warning checking that and use a
> different scheme based on whether we are freeing or not.
>
> Anshuman has kindly agreed to knock the patch into shape and do the testing.
> Hopefully he can post shortly.
>
>
> 2: hot-unplugging a range that starts or terminates in the middle of a large
> leaf mapping. The low level hot-unplug implementation allows unplugging any
> range of memory as long as it is section size aligned (128M). So theoretically
> you could have a 1G PUD leaf mapping and try to unplug 128M from the middle of
> it. In practice this doesn't happen because all the users of the hot-unplug code
> group memory into devices. If you add a range, you can only remove that same
> range. When adding, we will guarantee that the leaf mappings exactly map the
> range, so the same guarantee can be given for hot-remove.
>
> BUT, that feels fragile to me. I'd like to add a check in
> prevent_bootmem_remove_notifier() to ensure that the proposed unplug range is
> exactly covered by leaf mappings, and if it isn't, warn and reject. This will
> allow us to fail safe for a tiny amount of overhead (which will be made up for
> many, many times over by hoisting the tlbis and batching the barriers in 1).
>
> Anshuman has also kindly agreed to put a patch together for that.

Thanks for the update. Looking forward to seeing the patches from Anshuman
soon.

Thanks,
Yang

>
>
> Thanks,
> Ryan
>
>
>> Thanks,
>> Yang
>>
>>> Will

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 3 weeks, 3 days ago

On 1/13/26 6:36 AM, Will Deacon wrote:
> On Tue, Jan 06, 2026 at 04:29:44PM -0800, Yang Shi wrote:
>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>> rodata=full"), the direct mapping may be split on some machines instead
>> keeping static since boot. It makes more sense to show the direct mapping
>> use in /proc/meminfo than before.
>> This patch will make /proc/meminfo show the direct mapping use like the
>> below (4K base page size):
>> DirectMap4K:	   94792 kB
>> DirectMap64K:	  134208 kB
>> DirectMap2M:	 1173504 kB
>> DirectMap32M:	 5636096 kB
>> DirectMap1G:	529530880 kB
>>
>> Although just the machines which support BBML2_NOABORT can split the
>> direct mapping, show it on all machines regardless of BBML2_NOABORT so
>> that the users have consistent view in order to avoid confusion.
>>
>> Although ptdump also can tell the direct map use, but it needs to dump
>> the whole kernel page table. It is costly and overkilling. It is also
>> in debugfs which may not be enabled by all distros. So showing direct
>> map use in /proc/meminfo seems more convenient and has less overhead.
>>
>> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
>> ---
>> v5: * Rebased to v6.19-rc4
>>      * Fixed the build error for !CONFIG_PROC_FS
>> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>>      * Used shorter name for the helpers and variables per Ryan
>>      * Fixed accounting for memory hotunplug
>> v3: * Fixed the over-accounting problems per Ryan
>>      * Introduced helpers for add/sub direct map use and #ifdef them with
>>        CONFIG_PROC_FS per Ryan
>>      * v3 is a fix patch on top of v2
>> v2: * Counted in size instead of the number of entries per Ryan
>>      * Removed shift array per Ryan
>>      * Use lower case "k" per Ryan
>>      * Fixed a couple of build warnings reported by kernel test robot
>>      * Fixed a couple of poential miscounts
>>
>>   arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++++++-----
>>   1 file changed, 181 insertions(+), 21 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 8e1d80a7033e..422441c9a992 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -29,6 +29,7 @@
>>   #include <linux/mm_inline.h>
>>   #include <linux/pagewalk.h>
>>   #include <linux/stop_machine.h>
>> +#include <linux/proc_fs.h>
>>   
>>   #include <asm/barrier.h>
>>   #include <asm/cputype.h>
>> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>>   	dsb(ishst);
>>   }
>>   
>> +enum dm_type {
>> +	PTE,
>> +	CONT_PTE,
>> +	PMD,
>> +	CONT_PMD,
>> +	PUD,
>> +	NR_DM_TYPE,
>> +};
>> +
>> +#ifdef CONFIG_PROC_FS
>> +static unsigned long dm_meminfo[NR_DM_TYPE];
>> +
>> +void arch_report_meminfo(struct seq_file *m)
>> +{
>> +	char *size[NR_DM_TYPE];
> const?

Yeah, it can be const.

>
>> +
>> +#if defined(CONFIG_ARM64_4K_PAGES)
>> +	size[PTE] = "4k";
>> +	size[CONT_PTE] = "64k";
>> +	size[PMD] = "2M";
>> +	size[CONT_PMD] = "32M";
>> +	size[PUD] = "1G";
>> +#elif defined(CONFIG_ARM64_16K_PAGES)
>> +	size[PTE] = "16k";
>> +	size[CONT_PTE] = "2M";
>> +	size[PMD] = "32M";
>> +	size[CONT_PMD] = "1G";
>> +#elif defined(CONFIG_ARM64_64K_PAGES)
>> +	size[PTE] = "64k";
>> +	size[CONT_PTE] = "2M";
>> +	size[PMD] = "512M";
>> +	size[CONT_PMD] = "16G";
>> +#endif
>> +
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PTE], dm_meminfo[PTE] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[CONT_PTE],
>> +			dm_meminfo[CONT_PTE] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PMD], dm_meminfo[PMD] >> 10);
>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[CONT_PMD],
>> +			dm_meminfo[CONT_PMD] >> 10);
>> +	if (pud_sect_supported())
>> +		seq_printf(m, "DirectMap%s:	%8lu kB\n",
>> +			size[PUD], dm_meminfo[PUD] >> 10);
> This seems a bit brittle to me. If somebody adds support for l1 block
> mappings for !4k pages in future, they will forget to update this and
> we'll end up returning kernel stack in /proc/meminfo afaict.

I can initialize size[PUD] to "NON_SUPPORT" by default. If that case ever
happens, /proc/meminfo will just show "DirectMapNON_SUPPORT", so we will
notice that something was missed, but no kernel stack data will be leaked.
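
For illustration, something like the below (sketch only):

        /*
         * Default every slot so an unhandled config prints a recognisable
         * placeholder instead of uninitialised stack data.
         */
        const char *size[NR_DM_TYPE] = {
                [0 ... NR_DM_TYPE - 1] = "NON_SUPPORT",
        };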

>
>> +static inline bool is_dm_addr(unsigned long addr)
>> +{
>> +	return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
>> +}
>> +
>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +	if (is_dm_addr(addr))
>> +		dm_meminfo[type] += size;
>> +}
>> +
>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +	if (is_dm_addr(addr))
>> +		dm_meminfo[type] -= size;
>> +}
>> +#else
>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +}
>> +
>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long size,
>> +				  enum dm_type type)
>> +{
>> +}
>> +#endif
>> +
>>   static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>>   		     phys_addr_t phys, pgprot_t prot)
>>   {
>> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr,
>>   
>>   		init_pte(ptep, addr, next, phys, __prot);
>>   
>> +		if (pgprot_val(__prot) & PTE_CONT)
>> +			dm_meminfo_add(addr, (next - addr), CONT_PTE);
>> +		else
>> +			dm_meminfo_add(addr, (next - addr), PTE);
>> +
>>   		ptep += pte_index(next) - pte_index(addr);
>>   		phys += next - addr;
>>   	} while (addr = next, addr != end);
>> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>>   		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>   			pmd_set_huge(pmdp, phys, prot);
>>   
>> +			/*
>> +			 * It is possible to have mappings allow cont mapping
>> +			 * but disallow block mapping. For example,
>> +			 * map_entry_trampoline().
>> +			 * So we have to increase CONT_PMD and PMD size here
>> +			 * to avoid double counting.
>> +			 */
>> +			if (pgprot_val(prot) & PTE_CONT)
>> +				dm_meminfo_add(addr, (next - addr), CONT_PMD);
>> +			else
>> +				dm_meminfo_add(addr, (next - addr), PMD);
> I don't understand the comment you're adding here. If somebody passes
> NO_BLOCK_MAPPINGS then that also prevents contiguous entries except at
> level 3.

The comment may be misleading. I meant that if we put the accounting code
for CONT_PMD in alloc_init_cont_pmd() instead, for example,

@@ -433,6 +433,11 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
                if (ret)
                        goto out;

+               if (pgprot_val(prot) & PTE_CONT)
+                       dm_meminfo_add(addr, (next - addr), CONT_PMD);

                pmdp += pmd_index(next) - pmd_index(addr);
                phys += next - addr;
        } while (addr = next, addr != end);

then, if the described case happens, we would actually miscount CONT_PMD. So I
need to check whether the mapping is contiguous in init_pmd() instead. If the
comment is confusing, I can just remove it.

> It also doesn't look you handle the error case properly when the mapping
> fails.

I don't quite get which failure you mean. pmd_set_huge() doesn't fail. Or do
you mean a hotplug failure? If so, the hot-unplug path, which is called from
the error handling path, will decrease the counters.

>
>> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>>   				    unsigned long end, bool free_mapped,
>>   				    struct vmem_altmap *altmap)
>>   {
>> -	pte_t *ptep, pte;
>> +	pte_t pte;
>>   
>>   	do {
>> -		ptep = pte_offset_kernel(pmdp, addr);
>>   		pte = __ptep_get(ptep);
>>   		if (pte_none(pte))
>>   			continue;
>>   
>>   		WARN_ON(!pte_present(pte));
>>   		__pte_clear(&init_mm, addr, ptep);
>> +		dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>>   		flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>   		if (free_mapped)
>>   			free_hotplug_page_range(pte_page(pte),
>>   						PAGE_SIZE, altmap);
> Is the existing code correct for contiguous entries here? I'd have
> thought that we'd need to make the range non-contiguous before knocking
> out the TLB.

Thanks for pointing this out. I didn't pay much attention to such details in
the existing code. I did notice that the hot unplug code doesn't handle
contiguous mappings specially, so I added unmap_hotplug_cont_{pmd|pte}_range()
in this patch in order to maintain the counters correctly. I'm not sure whether
that was intended (or maybe just unnecessary) or an oversight.

Are you concerned this may result in a contiguous-bit misprogramming issue?
TBH, I'm a little confused by the "misprogramming of the Contiguous bit"
described in the Arm ARM. In this case the TLB flush should remove the large
contiguous TLB entry, so there should be no overlapping entries in the TLB. Or
is this still a problem because some entries have the contiguous bit set while
others don't? When we change the contiguous bit for a range of entries, there
is always a moment where some have the bit set and some don't. My understanding
is that it is fine as long as there are no overlapping entries in the TLB, but
I may be misunderstanding it.
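
If the answer is that all the entries of a contiguous range must be cleared
before any invalidation, then I guess the cont-pte helper would end up shaped
roughly like the below (sketch only, not the current patch code):

static void unmap_hotplug_cont_pte_range(pte_t *ptep, unsigned long addr,
                                         unsigned long end)
{
        unsigned long start = addr;
        pte_t pte;

        /* Clear every entry of the contiguous range before any tlbi. */
        do {
                pte = __ptep_get(ptep);
                if (pte_none(pte))
                        continue;

                WARN_ON(!pte_present(pte));
                __pte_clear(&init_mm, addr, ptep);
                dm_meminfo_sub(addr, PAGE_SIZE, CONT_PTE);
        } while (ptep++, addr += PAGE_SIZE, addr != end);

        /* One invalidation covering the whole contiguous range. */
        flush_tlb_kernel_range(start, end);
}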

Thanks,
Yang

>
> Will

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Will Deacon 1 week, 4 days ago
On Tue, Jan 13, 2026 at 04:36:06PM -0800, Yang Shi wrote:
> On 1/13/26 6:36 AM, Will Deacon wrote:
> > On Tue, Jan 06, 2026 at 04:29:44PM -0800, Yang Shi wrote:
> > > +#if defined(CONFIG_ARM64_4K_PAGES)
> > > +	size[PTE] = "4k";
> > > +	size[CONT_PTE] = "64k";
> > > +	size[PMD] = "2M";
> > > +	size[CONT_PMD] = "32M";
> > > +	size[PUD] = "1G";
> > > +#elif defined(CONFIG_ARM64_16K_PAGES)
> > > +	size[PTE] = "16k";
> > > +	size[CONT_PTE] = "2M";
> > > +	size[PMD] = "32M";
> > > +	size[CONT_PMD] = "1G";
> > > +#elif defined(CONFIG_ARM64_64K_PAGES)
> > > +	size[PTE] = "64k";
> > > +	size[CONT_PTE] = "2M";
> > > +	size[PMD] = "512M";
> > > +	size[CONT_PMD] = "16G";
> > > +#endif
> > > +
> > > +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> > > +			size[PTE], dm_meminfo[PTE] >> 10);
> > > +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> > > +			size[CONT_PTE],
> > > +			dm_meminfo[CONT_PTE] >> 10);
> > > +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> > > +			size[PMD], dm_meminfo[PMD] >> 10);
> > > +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
> > > +			size[CONT_PMD],
> > > +			dm_meminfo[CONT_PMD] >> 10);
> > > +	if (pud_sect_supported())
> > > +		seq_printf(m, "DirectMap%s:	%8lu kB\n",
> > > +			size[PUD], dm_meminfo[PUD] >> 10);
> > This seems a bit brittle to me. If somebody adds support for l1 block
> > mappings for !4k pages in future, they will forget to update this and
> > we'll end up returning kernel stack in /proc/meminfo afaict.
> 
> I can initialize size[PUD] to "NON_SUPPORT" by default. If the case happens,
> /proc/meminfo just shows "DirectMapNON_SUPPORT", then we will notice
> something is missed, but no kernel stack data will be leak.

Or just add the PUD sizes for all the page sizes...
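
For illustration, the !4K branches would then look something like the below
(sketch; the "64G" and "4T" strings are assumptions based on the level-1
block sizes for the 16K and 64K granules, not values taken from the patch):

#elif defined(CONFIG_ARM64_16K_PAGES)
        size[PTE] = "16k";
        size[CONT_PTE] = "2M";
        size[PMD] = "32M";
        size[CONT_PMD] = "1G";
        size[PUD] = "64G";      /* assumed level-1 block size for 16K pages */
#elif defined(CONFIG_ARM64_64K_PAGES)
        size[PTE] = "64k";
        size[CONT_PTE] = "2M";
        size[PMD] = "512M";
        size[CONT_PMD] = "16G";
        size[PUD] = "4T";       /* assumed level-1 block size for 64K pages */
#endif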

> > > @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
> > >   		    (flags & NO_BLOCK_MAPPINGS) == 0) {
> > >   			pmd_set_huge(pmdp, phys, prot);
> > > +			/*
> > > +			 * It is possible to have mappings allow cont mapping
> > > +			 * but disallow block mapping. For example,
> > > +			 * map_entry_trampoline().
> > > +			 * So we have to increase CONT_PMD and PMD size here
> > > +			 * to avoid double counting.
> > > +			 */
> > > +			if (pgprot_val(prot) & PTE_CONT)
> > > +				dm_meminfo_add(addr, (next - addr), CONT_PMD);
> > > +			else
> > > +				dm_meminfo_add(addr, (next - addr), PMD);
> > I don't understand the comment you're adding here. If somebody passes
> > NO_BLOCK_MAPPINGS then that also prevents contiguous entries except at
> > level 3.
> 
> The comment may be misleading. I meant if we have the accounting code for
> CONT_PMD in alloc_init_cont_pmd(), for example,

I think I'd just drop the comment. The code is clear enough once you
actually read what's going on.

> @@ -433,6 +433,11 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned
> long addr,
>                 if (ret)
>                         goto out;
> 
> +               if (pgprot_val(prot) & PTE_CONT)
> +                       dm_meminfo_add(addr, (next - addr), CONT_PMD);
> 
>                 pmdp += pmd_index(next) - pmd_index(addr);
>                 phys += next - addr;
>         } while (addr = next, addr != end);
> 
> If the described case happens, we actually miscount CONT_PMD. So I need to
> check whether it is CONT in init_pmd() instead. If the comment is confusing,
> I can just remove it.
> 
> > It also doesn't look you handle the error case properly when the mapping
> > fails.
> 
> I don't quite get what fail do you mean? pmd_set_huge() doesn't fail. Or you
> meant hotplug fails? If so the hot unplug will decrease the counters, which
> is called in the error handling path.

Sorry, I got confused here and thought that we could end up with a
partially-formed contiguous region but that's not the case. So you can
ignore this comment :)

Will
Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 1 week, 4 days ago

On 1/26/26 6:18 AM, Will Deacon wrote:
> On Tue, Jan 13, 2026 at 04:36:06PM -0800, Yang Shi wrote:
>> On 1/13/26 6:36 AM, Will Deacon wrote:
>>> On Tue, Jan 06, 2026 at 04:29:44PM -0800, Yang Shi wrote:
>>>> +#if defined(CONFIG_ARM64_4K_PAGES)
>>>> +	size[PTE] = "4k";
>>>> +	size[CONT_PTE] = "64k";
>>>> +	size[PMD] = "2M";
>>>> +	size[CONT_PMD] = "32M";
>>>> +	size[PUD] = "1G";
>>>> +#elif defined(CONFIG_ARM64_16K_PAGES)
>>>> +	size[PTE] = "16k";
>>>> +	size[CONT_PTE] = "2M";
>>>> +	size[PMD] = "32M";
>>>> +	size[CONT_PMD] = "1G";
>>>> +#elif defined(CONFIG_ARM64_64K_PAGES)
>>>> +	size[PTE] = "64k";
>>>> +	size[CONT_PTE] = "2M";
>>>> +	size[PMD] = "512M";
>>>> +	size[CONT_PMD] = "16G";
>>>> +#endif
>>>> +
>>>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>>> +			size[PTE], dm_meminfo[PTE] >> 10);
>>>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>>> +			size[CONT_PTE],
>>>> +			dm_meminfo[CONT_PTE] >> 10);
>>>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>>> +			size[PMD], dm_meminfo[PMD] >> 10);
>>>> +	seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>>> +			size[CONT_PMD],
>>>> +			dm_meminfo[CONT_PMD] >> 10);
>>>> +	if (pud_sect_supported())
>>>> +		seq_printf(m, "DirectMap%s:	%8lu kB\n",
>>>> +			size[PUD], dm_meminfo[PUD] >> 10);
>>> This seems a bit brittle to me. If somebody adds support for l1 block
>>> mappings for !4k pages in future, they will forget to update this and
>>> we'll end up returning kernel stack in /proc/meminfo afaict.
>> I can initialize size[PUD] to "NON_SUPPORT" by default. If the case happens,
>> /proc/meminfo just shows "DirectMapNON_SUPPORT", then we will notice
>> something is missed, but no kernel stack data will be leak.
> Or just add the PUD sizes for all the page sizes...

Fine by me.

>
>>>> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>>>>    		    (flags & NO_BLOCK_MAPPINGS) == 0) {
>>>>    			pmd_set_huge(pmdp, phys, prot);
>>>> +			/*
>>>> +			 * It is possible to have mappings allow cont mapping
>>>> +			 * but disallow block mapping. For example,
>>>> +			 * map_entry_trampoline().
>>>> +			 * So we have to increase CONT_PMD and PMD size here
>>>> +			 * to avoid double counting.
>>>> +			 */
>>>> +			if (pgprot_val(prot) & PTE_CONT)
>>>> +				dm_meminfo_add(addr, (next - addr), CONT_PMD);
>>>> +			else
>>>> +				dm_meminfo_add(addr, (next - addr), PMD);
>>> I don't understand the comment you're adding here. If somebody passes
>>> NO_BLOCK_MAPPINGS then that also prevents contiguous entries except at
>>> level 3.
>> The comment may be misleading. I meant if we have the accounting code for
>> CONT_PMD in alloc_init_cont_pmd(), for example,
> I think I'd just drop the comment. The code is clear enough once you
> actually read what's going on.

Sure.

>
>> @@ -433,6 +433,11 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned
>> long addr,
>>                  if (ret)
>>                          goto out;
>>
>> +               if (pgprot_val(prot) & PTE_CONT)
>> +                       dm_meminfo_add(addr, (next - addr), CONT_PMD);
>>
>>                  pmdp += pmd_index(next) - pmd_index(addr);
>>                  phys += next - addr;
>>          } while (addr = next, addr != end);
>>
>> If the described case happens, we actually miscount CONT_PMD. So I need to
>> check whether it is CONT in init_pmd() instead. If the comment is confusing,
>> I can just remove it.
>>
>>> It also doesn't look you handle the error case properly when the mapping
>>> fails.
>> I don't quite get what fail do you mean? pmd_set_huge() doesn't fail. Or you
>> meant hotplug fails? If so the hot unplug will decrease the counters, which
>> is called in the error handling path.
> Sorry, I got confused here and thought that we could end up with a
> partially-formed contiguous region but that's not the case. So you can
> ignore this comment :)

No problem. Thanks for taking the time to review the patch.

I will prepare a new revision once we figure out the potential contiguous-bit
misprogramming issue.

Thanks,
Yang

>
> Will

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Yang Shi 2 weeks, 3 days ago
Hi Will,

Gentle ping: does the proposed change address your concerns? If it does, I
can send v6; hopefully it can be merged in the coming merge window.

Thanks,
Yang


On 1/13/26 4:36 PM, Yang Shi wrote:
>
>
> On 1/13/26 6:36 AM, Will Deacon wrote:
>> On Tue, Jan 06, 2026 at 04:29:44PM -0800, Yang Shi wrote:
>>> Since commit a166563e7ec3 ("arm64: mm: support large block mapping when
>>> rodata=full"), the direct mapping may be split on some machines instead
>>> keeping static since boot. It makes more sense to show the direct 
>>> mapping
>>> use in /proc/meminfo than before.
>>> This patch will make /proc/meminfo show the direct mapping use like the
>>> below (4K base page size):
>>> DirectMap4K:       94792 kB
>>> DirectMap64K:      134208 kB
>>> DirectMap2M:     1173504 kB
>>> DirectMap32M:     5636096 kB
>>> DirectMap1G:    529530880 kB
>>>
>>> Although just the machines which support BBML2_NOABORT can split the
>>> direct mapping, show it on all machines regardless of BBML2_NOABORT so
>>> that the users have consistent view in order to avoid confusion.
>>>
>>> Although ptdump also can tell the direct map use, but it needs to dump
>>> the whole kernel page table. It is costly and overkilling. It is also
>>> in debugfs which may not be enabled by all distros. So showing direct
>>> map use in /proc/meminfo seems more convenient and has less overhead.
>>>
>>> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
>>> ---
>>> v5: * Rebased to v6.19-rc4
>>>      * Fixed the build error for !CONFIG_PROC_FS
>>> v4: * Used PAGE_END instead of _PAGE_END(VA_BITS_MIN) per Ryan
>>>      * Used shorter name for the helpers and variables per Ryan
>>>      * Fixed accounting for memory hotunplug
>>> v3: * Fixed the over-accounting problems per Ryan
>>>      * Introduced helpers for add/sub direct map use and #ifdef them 
>>> with
>>>        CONFIG_PROC_FS per Ryan
>>>      * v3 is a fix patch on top of v2
>>> v2: * Counted in size instead of the number of entries per Ryan
>>>      * Removed shift array per Ryan
>>>      * Use lower case "k" per Ryan
>>>      * Fixed a couple of build warnings reported by kernel test robot
>>>      * Fixed a couple of poential miscounts
>>>
>>>   arch/arm64/mm/mmu.c | 202 
>>> +++++++++++++++++++++++++++++++++++++++-----
>>>   1 file changed, 181 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>>> index 8e1d80a7033e..422441c9a992 100644
>>> --- a/arch/arm64/mm/mmu.c
>>> +++ b/arch/arm64/mm/mmu.c
>>> @@ -29,6 +29,7 @@
>>>   #include <linux/mm_inline.h>
>>>   #include <linux/pagewalk.h>
>>>   #include <linux/stop_machine.h>
>>> +#include <linux/proc_fs.h>
>>>     #include <asm/barrier.h>
>>>   #include <asm/cputype.h>
>>> @@ -171,6 +172,85 @@ static void init_clear_pgtable(void *table)
>>>       dsb(ishst);
>>>   }
>>>   +enum dm_type {
>>> +    PTE,
>>> +    CONT_PTE,
>>> +    PMD,
>>> +    CONT_PMD,
>>> +    PUD,
>>> +    NR_DM_TYPE,
>>> +};
>>> +
>>> +#ifdef CONFIG_PROC_FS
>>> +static unsigned long dm_meminfo[NR_DM_TYPE];
>>> +
>>> +void arch_report_meminfo(struct seq_file *m)
>>> +{
>>> +    char *size[NR_DM_TYPE];
>> const?
>
> Yeah, it can be const.
>
>>
>>> +
>>> +#if defined(CONFIG_ARM64_4K_PAGES)
>>> +    size[PTE] = "4k";
>>> +    size[CONT_PTE] = "64k";
>>> +    size[PMD] = "2M";
>>> +    size[CONT_PMD] = "32M";
>>> +    size[PUD] = "1G";
>>> +#elif defined(CONFIG_ARM64_16K_PAGES)
>>> +    size[PTE] = "16k";
>>> +    size[CONT_PTE] = "2M";
>>> +    size[PMD] = "32M";
>>> +    size[CONT_PMD] = "1G";
>>> +#elif defined(CONFIG_ARM64_64K_PAGES)
>>> +    size[PTE] = "64k";
>>> +    size[CONT_PTE] = "2M";
>>> +    size[PMD] = "512M";
>>> +    size[CONT_PMD] = "16G";
>>> +#endif
>>> +
>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>> +            size[PTE], dm_meminfo[PTE] >> 10);
>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>> +            size[CONT_PTE],
>>> +            dm_meminfo[CONT_PTE] >> 10);
>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>> +            size[PMD], dm_meminfo[PMD] >> 10);
>>> +    seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>> +            size[CONT_PMD],
>>> +            dm_meminfo[CONT_PMD] >> 10);
>>> +    if (pud_sect_supported())
>>> +        seq_printf(m, "DirectMap%s:    %8lu kB\n",
>>> +            size[PUD], dm_meminfo[PUD] >> 10);
>> This seems a bit brittle to me. If somebody adds support for l1 block
>> mappings for !4k pages in future, they will forget to update this and
>> we'll end up returning kernel stack in /proc/meminfo afaict.
>
> I can initialize size[PUD] to "NON_SUPPORT" by default. If the case 
> happens, /proc/meminfo just shows "DirectMapNON_SUPPORT", then we will 
> notice something is missed, but no kernel stack data will be leak.
>
>>
>>> +static inline bool is_dm_addr(unsigned long addr)
>>> +{
>>> +    return (addr >= PAGE_OFFSET) && (addr < PAGE_END);
>>> +}
>>> +
>>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long 
>>> size,
>>> +                  enum dm_type type)
>>> +{
>>> +    if (is_dm_addr(addr))
>>> +        dm_meminfo[type] += size;
>>> +}
>>> +
>>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long 
>>> size,
>>> +                  enum dm_type type)
>>> +{
>>> +    if (is_dm_addr(addr))
>>> +        dm_meminfo[type] -= size;
>>> +}
>>> +#else
>>> +static inline void dm_meminfo_add(unsigned long addr, unsigned long 
>>> size,
>>> +                  enum dm_type type)
>>> +{
>>> +}
>>> +
>>> +static inline void dm_meminfo_sub(unsigned long addr, unsigned long 
>>> size,
>>> +                  enum dm_type type)
>>> +{
>>> +}
>>> +#endif
>>> +
>>>   static void init_pte(pte_t *ptep, unsigned long addr, unsigned 
>>> long end,
>>>                phys_addr_t phys, pgprot_t prot)
>>>   {
>>> @@ -236,6 +316,11 @@ static int alloc_init_cont_pte(pmd_t *pmdp, 
>>> unsigned long addr,
>>>             init_pte(ptep, addr, next, phys, __prot);
>>>   +        if (pgprot_val(__prot) & PTE_CONT)
>>> +            dm_meminfo_add(addr, (next - addr), CONT_PTE);
>>> +        else
>>> +            dm_meminfo_add(addr, (next - addr), PTE);
>>> +
>>>           ptep += pte_index(next) - pte_index(addr);
>>>           phys += next - addr;
>>>       } while (addr = next, addr != end);
>>> @@ -266,6 +351,17 @@ static int init_pmd(pmd_t *pmdp, unsigned long 
>>> addr, unsigned long end,
>>>               (flags & NO_BLOCK_MAPPINGS) == 0) {
>>>               pmd_set_huge(pmdp, phys, prot);
>>>   +            /*
>>> +             * It is possible to have mappings allow cont mapping
>>> +             * but disallow block mapping. For example,
>>> +             * map_entry_trampoline().
>>> +             * So we have to increase CONT_PMD and PMD size here
>>> +             * to avoid double counting.
>>> +             */
>>> +            if (pgprot_val(prot) & PTE_CONT)
>>> +                dm_meminfo_add(addr, (next - addr), CONT_PMD);
>>> +            else
>>> +                dm_meminfo_add(addr, (next - addr), PMD);
>> I don't understand the comment you're adding here. If somebody passes
>> NO_BLOCK_MAPPINGS then that also prevents contiguous entries except at
>> level 3.
>
> The comment may be misleading. I meant if we have the accounting code 
> for CONT_PMD in alloc_init_cont_pmd(), for example,
>
> @@ -433,6 +433,11 @@ static int alloc_init_cont_pmd(pud_t *pudp, 
> unsigned long addr,
>                 if (ret)
>                         goto out;
>
> +               if (pgprot_val(prot) & PTE_CONT)
> +                       dm_meminfo_add(addr, (next - addr), CONT_PMD);
>
>                 pmdp += pmd_index(next) - pmd_index(addr);
>                 phys += next - addr;
>         } while (addr = next, addr != end);
>
> If the described case happens, we actually miscount CONT_PMD. So I 
> need to check whether it is CONT in init_pmd() instead. If the comment 
> is confusing, I can just remove it.
>
>> It also doesn't look you handle the error case properly when the mapping
>> fails.
>
> I don't quite get what fail do you mean? pmd_set_huge() doesn't fail. 
> Or you meant hotplug fails? If so the hot unplug will decrease the 
> counters, which is called in the error handling path.
>
>>
>>> -static void unmap_hotplug_pte_range(pmd_t *pmdp, unsigned long addr,
>>> +static void unmap_hotplug_pte_range(pte_t *ptep, unsigned long addr,
>>>                       unsigned long end, bool free_mapped,
>>>                       struct vmem_altmap *altmap)
>>>   {
>>> -    pte_t *ptep, pte;
>>> +    pte_t pte;
>>>         do {
>>> -        ptep = pte_offset_kernel(pmdp, addr);
>>>           pte = __ptep_get(ptep);
>>>           if (pte_none(pte))
>>>               continue;
>>>             WARN_ON(!pte_present(pte));
>>>           __pte_clear(&init_mm, addr, ptep);
>>> +        dm_meminfo_sub(addr, PAGE_SIZE, PTE);
>>>           flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
>>>           if (free_mapped)
>>>               free_hotplug_page_range(pte_page(pte),
>>>                           PAGE_SIZE, altmap);
>> Is the existing code correct for contiguous entries here? I'd have
>> thought that we'd need to make the range non-contiguous before knocking
>> out the TLB.
>
> Thanks for pointing this out. I didn't pay too much attention to such 
> details to the existing code. Actually I did notice hot unplug code 
> doesn't handle contiguous mappings, so I added 
> unmap_hotplug_cont_{pmd|pte}_range() in this patch in order to 
> maintain the counters correctly. I'm not sure it is intended (or maybe 
> just unnecessary) or just an overlook.
>
> You are concerned this may result in misprogramming issue? TBH, I'm a 
> little bit confused by the "misprogramming contiguous bit" described 
> in ARM ARM. In this case, the TLB flush should remove the large 
> contiguous TLB entry, so there should be no overlapping entries in 
> TLB. Or this is still a problem because some entries have contiguous 
> bit set, but some don't? But when we change the contiguous bit for a 
> range of entries, there is always a moment that some have contiguous 
> bit set, but some don't. My understanding is it is fine as long as 
> there is no overlapping entries in TLB. Anyway I may misunderstand it.
>
> Thanks,
> Yang
>
>>
>> Will
>

Re: [v5 PATCH] arm64: mm: show direct mapping use in /proc/meminfo
Posted by Christoph Lameter (Ampere) 1 month ago
LGTM

Reviewed-by: Christoph Lameter (Ampere) <cl@gentwo.org>