When rodata=full is specified, the kernel linear mapping has to be mapped at
PTE level since large block mappings can't be split due to the
break-before-make rule on ARM64.
This results in a few problems:
- performance degradation
- more TLB pressure
- memory wasted on kernel page tables
With FEAT_BBM level 2 support, splitting a large block mapping into smaller
ones no longer requires invalidating the page table entry first. This allows
the kernel to split large block mappings on the fly.
Add kernel page table split support and use large block mappings by default
for rodata=full when FEAT_BBM level 2 is supported. When changing permissions
on the kernel linear mapping, the affected entries will be split down to PTE
level.
Machines without FEAT_BBM level 2 fall back to a PTE-mapped kernel linear
mapping when rodata=full.
With this we saw a significant performance boost in some benchmarks and much
lower memory consumption on my AmpereOne machine (192 cores, 1P) with 256GB
memory.
* Memory use after boot
Before:
MemTotal: 258988984 kB
MemFree: 254821700 kB
After:
MemTotal: 259505132 kB
MemFree: 255410264 kB
Around 500MB more memory is free to use. The larger the machine, the more
memory is saved.
* Memcached
We saw performance degradation when running the Memcached benchmark with
rodata=full vs rodata=on. Our profiling pointed to kernel TLB pressure. With
this patchset, ops/sec increased by around 3.5% and P99 latency dropped by
around 9.6%.
The gain mainly came from reduced kernel TLB misses: kernel TLB MPKI is
reduced by 28.5%.
The benchmark data is now on par with rodata=on too.
* Disk encryption (dm-crypt) benchmark
Ran the fio benchmark with the below command on a 128G ramdisk (ext4) with
disk encryption (via dm-crypt).
fio --directory=/data --random_generator=lfsr --norandommap --randrepeat 1 \
--status-interval=999 --rw=write --bs=4k --loops=1 --ioengine=sync \
--iodepth=1 --numjobs=1 --fsync_on_close=1 --group_reporting --thread \
--name=iops-test-job --eta-newline=1 --size 100G
IOPS increased by 90% - 150% (the variance is high, but the worst number from
the good case is around 90% higher than the best number from the bad case).
Bandwidth increased and the average completion latency (clat) dropped
proportionally.
* Sequential file read
Read a 100G file sequentially on XFS (xfs_io read with the page cache
populated). Bandwidth increased by 150%.
Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
---
arch/arm64/include/asm/cpufeature.h | 10 ++
arch/arm64/include/asm/mmu.h | 1 +
arch/arm64/include/asm/pgtable.h | 7 +-
arch/arm64/kernel/cpufeature.c | 2 +-
arch/arm64/mm/mmu.c | 169 +++++++++++++++++++++++++++-
arch/arm64/mm/pageattr.c | 35 +++++-
6 files changed, 211 insertions(+), 13 deletions(-)
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 108ef3fbbc00..e24edc32b0bd 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -871,6 +871,16 @@ static inline bool system_supports_bbml2_noabort(void)
return alternative_has_cap_unlikely(ARM64_HAS_BBML2_NOABORT);
}
+bool cpu_has_bbml2_noabort(unsigned int cpu_midr);
+/*
+ * Called at early boot stage on boot CPU before cpu info and cpu feature
+ * are ready.
+ */
+static inline bool bbml2_noabort_available(void)
+{
+ return cpu_has_bbml2_noabort(read_cpuid_id());
+}
+
int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 662471cfc536..d658a33df266 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -71,6 +71,7 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
pgprot_t prot, bool page_mappings_only);
extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
extern void mark_linear_text_alias_ro(void);
+extern int split_linear_mapping(unsigned long start, unsigned long end);
/*
* This check is triggered during the early boot before the cpufeature
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 0b2a2ad1b9e8..ed2fc1dcf7ae 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -749,7 +749,7 @@ static inline bool in_swapper_pgdir(void *addr)
((unsigned long)swapper_pg_dir & PAGE_MASK);
}
-static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
+static inline void __set_pmd_nosync(pmd_t *pmdp, pmd_t pmd)
{
#ifdef __PAGETABLE_PMD_FOLDED
if (in_swapper_pgdir(pmdp)) {
@@ -759,6 +759,11 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
#endif /* __PAGETABLE_PMD_FOLDED */
WRITE_ONCE(*pmdp, pmd);
+}
+
+static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
+{
+ __set_pmd_nosync(pmdp, pmd);
if (pmd_valid(pmd)) {
dsb(ishst);
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index bf3df8407ca3..d39637d5aeab 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2176,7 +2176,7 @@ static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE);
}
-static bool cpu_has_bbml2_noabort(unsigned int cpu_midr)
+bool cpu_has_bbml2_noabort(unsigned int cpu_midr)
{
/* We want to allow usage of bbml2 in as wide a range of kernel contexts
* as possible. This list is therefore an allow-list of known-good
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index dccf0877285b..ad0f1cc55e3a 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -45,6 +45,7 @@
#define NO_BLOCK_MAPPINGS BIT(0)
#define NO_CONT_MAPPINGS BIT(1)
#define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
+#define SPLIT_MAPPINGS BIT(3)
u64 kimage_voffset __ro_after_init;
EXPORT_SYMBOL(kimage_voffset);
@@ -166,6 +167,73 @@ static void init_clear_pgtable(void *table)
dsb(ishst);
}
+static int split_pmd(pmd_t *pmdp, pmd_t pmdval,
+ phys_addr_t (*pgtable_alloc)(int))
+{
+ unsigned long pfn;
+ pgprot_t prot;
+ phys_addr_t pte_phys;
+ pte_t *ptep;
+
+ if (!pmd_leaf(pmdval))
+ return 0;
+
+ pfn = pmd_pfn(pmdval);
+ prot = pmd_pgprot(pmdval);
+
+ pte_phys = pgtable_alloc(PAGE_SHIFT);
+ if (!pte_phys)
+ return -ENOMEM;
+
+ ptep = (pte_t *)phys_to_virt(pte_phys);
+ init_clear_pgtable(ptep);
+ prot = __pgprot(pgprot_val(prot) | PTE_TYPE_PAGE);
+ for (int i = 0; i < PTRS_PER_PTE; i++, ptep++)
+ __set_pte_nosync(ptep, pfn_pte(pfn + i, prot));
+
+ dsb(ishst);
+
+ set_pmd(pmdp, pfn_pmd(__phys_to_pfn(pte_phys),
+ __pgprot(PMD_TYPE_TABLE)));
+
+ return 0;
+}
+
+static int split_pud(pud_t *pudp, pud_t pudval,
+ phys_addr_t (*pgtable_alloc)(int))
+{
+ unsigned long pfn;
+ pgprot_t prot;
+ pmd_t *pmdp;
+ phys_addr_t pmd_phys;
+ unsigned int step;
+
+ if (!pud_leaf(pudval))
+ return 0;
+
+ pfn = pud_pfn(pudval);
+ prot = pud_pgprot(pudval);
+ step = PMD_SIZE >> PAGE_SHIFT;
+
+ pmd_phys = pgtable_alloc(PMD_SHIFT);
+ if (!pmd_phys)
+ return -ENOMEM;
+
+ pmdp = (pmd_t *)phys_to_virt(pmd_phys);
+ init_clear_pgtable(pmdp);
+ for (int i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
+ __set_pmd_nosync(pmdp, pfn_pmd(pfn, prot));
+ pfn += step;
+ }
+
+ dsb(ishst);
+
+ set_pud(pudp, pfn_pud(__phys_to_pfn(pmd_phys),
+ __pgprot(PUD_TYPE_TABLE)));
+
+ return 0;
+}
+
static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
phys_addr_t phys, pgprot_t prot)
{
@@ -251,12 +319,21 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
{
unsigned long next;
int ret = 0;
+ bool split = flags & SPLIT_MAPPINGS;
do {
pmd_t old_pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
+ if (split) {
+ ret = split_pmd(pmdp, old_pmd, pgtable_alloc);
+ if (ret)
+ break;
+
+ continue;
+ }
+
/* try section mapping first */
if (((addr | next | phys) & ~PMD_MASK) == 0 &&
(flags & NO_BLOCK_MAPPINGS) == 0) {
@@ -292,11 +369,19 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
int ret = 0;
pud_t pud = READ_ONCE(*pudp);
pmd_t *pmdp;
+ bool split = flags & SPLIT_MAPPINGS;
/*
* Check for initial section mappings in the pgd/pud.
*/
BUG_ON(pud_sect(pud));
+
+ if (split) {
+ BUG_ON(pud_none(pud));
+ pmdp = pmd_offset(pudp, addr);
+ goto split_pgtable;
+ }
+
if (pud_none(pud)) {
pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
phys_addr_t pmd_phys;
@@ -316,6 +401,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
pmdp = pmd_set_fixmap_offset(pudp, addr);
}
+split_pgtable:
do {
pgprot_t __prot = prot;
@@ -334,7 +420,8 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
phys += next - addr;
} while (addr = next, addr != end);
- pmd_clear_fixmap();
+ if (!split)
+ pmd_clear_fixmap();
return ret;
}
@@ -348,6 +435,13 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
int ret = 0;
p4d_t p4d = READ_ONCE(*p4dp);
pud_t *pudp;
+ bool split = flags & SPLIT_MAPPINGS;
+
+ if (split) {
+ BUG_ON(p4d_none(p4d));
+ pudp = pud_offset(p4dp, addr);
+ goto split_pgtable;
+ }
if (p4d_none(p4d)) {
p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF;
@@ -368,11 +462,25 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
pudp = pud_set_fixmap_offset(p4dp, addr);
}
+split_pgtable:
do {
pud_t old_pud = READ_ONCE(*pudp);
next = pud_addr_end(addr, end);
+ if (split) {
+ ret = split_pud(pudp, old_pud, pgtable_alloc);
+ if (ret)
+ break;
+
+ ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
+ pgtable_alloc, flags);
+ if (ret)
+ break;
+
+ continue;
+ }
+
/*
* For 4K granule only, attempt to put down a 1GB block
*/
@@ -399,7 +507,8 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
phys += next - addr;
} while (pudp++, addr = next, addr != end);
- pud_clear_fixmap();
+ if (!split)
+ pud_clear_fixmap();
return ret;
}
@@ -413,6 +522,13 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
int ret = 0;
pgd_t pgd = READ_ONCE(*pgdp);
p4d_t *p4dp;
+ bool split = flags & SPLIT_MAPPINGS;
+
+ if (split) {
+ BUG_ON(pgd_none(pgd));
+ p4dp = p4d_offset(pgdp, addr);
+ goto split_pgtable;
+ }
if (pgd_none(pgd)) {
pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF;
@@ -433,6 +549,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
p4dp = p4d_set_fixmap_offset(pgdp, addr);
}
+split_pgtable:
do {
p4d_t old_p4d = READ_ONCE(*p4dp);
@@ -449,7 +566,8 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
phys += next - addr;
} while (p4dp++, addr = next, addr != end);
- p4d_clear_fixmap();
+ if (!split)
+ p4d_clear_fixmap();
return ret;
}
@@ -546,6 +664,23 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
return pa;
}
+int split_linear_mapping(unsigned long start, unsigned long end)
+{
+ int ret = 0;
+
+ if (!system_supports_bbml2_noabort())
+ return 0;
+
+ mmap_write_lock(&init_mm);
+ ret = __create_pgd_mapping_locked(init_mm.pgd, virt_to_phys((void *)start),
+ start, (end - start), __pgprot(0),
+ __pgd_pgtable_alloc, SPLIT_MAPPINGS);
+ mmap_write_unlock(&init_mm);
+ flush_tlb_kernel_range(start, end);
+
+ return ret;
+}
+
/*
* This function can only be used to modify existing table entries,
* without allocating new levels of table. Note that this permits the
@@ -665,6 +800,24 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
#endif /* CONFIG_KFENCE */
+static inline bool force_pte_mapping(void)
+{
+ /*
+ * Can't use cpufeature API to determine whether BBML2 supported
+ * or not since cpufeature have not been finalized yet.
+ *
+ * Checking the boot CPU only for now. If the boot CPU has
+ * BBML2, paint linear mapping with block mapping. If it turns
+ * out the secondary CPUs don't support BBML2 once cpufeature is
+ * fininalized, the linear mapping will be repainted with PTE
+ * mapping.
+ */
+ return (rodata_full && !bbml2_noabort_available()) ||
+ debug_pagealloc_enabled() ||
+ arm64_kfence_can_set_direct_map() ||
+ is_realm_world();
+}
+
static void __init map_mem(pgd_t *pgdp)
{
static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
@@ -690,9 +843,12 @@ static void __init map_mem(pgd_t *pgdp)
early_kfence_pool = arm64_kfence_alloc_pool();
- if (can_set_direct_map())
+ if (force_pte_mapping())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+ if (rodata_full)
+ flags |= NO_CONT_MAPPINGS;
+
/*
* Take care not to create a writable alias for the
* read-only text and rodata sections of the kernel image.
@@ -1388,9 +1544,12 @@ int arch_add_memory(int nid, u64 start, u64 size,
VM_BUG_ON(!mhp_range_allowed(start, size, true));
- if (can_set_direct_map())
+ if (force_pte_mapping())
flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
+ if (rodata_full)
+ flags |= NO_CONT_MAPPINGS;
+
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
size, params->pgprot, __pgd_pgtable_alloc,
flags);
diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
index 39fd1f7ff02a..5d42d87ea7e1 100644
--- a/arch/arm64/mm/pageattr.c
+++ b/arch/arm64/mm/pageattr.c
@@ -10,6 +10,7 @@
#include <linux/vmalloc.h>
#include <asm/cacheflush.h>
+#include <asm/mmu.h>
#include <asm/pgtable-prot.h>
#include <asm/set_memory.h>
#include <asm/tlbflush.h>
@@ -80,8 +81,9 @@ static int change_memory_common(unsigned long addr, int numpages,
unsigned long start = addr;
unsigned long size = PAGE_SIZE * numpages;
unsigned long end = start + size;
+ unsigned long l_start;
struct vm_struct *area;
- int i;
+ int i, ret;
if (!PAGE_ALIGNED(addr)) {
start &= PAGE_MASK;
@@ -118,7 +120,12 @@ static int change_memory_common(unsigned long addr, int numpages,
if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
pgprot_val(clear_mask) == PTE_RDONLY)) {
for (i = 0; i < area->nr_pages; i++) {
- __change_memory_common((u64)page_address(area->pages[i]),
+ l_start = (u64)page_address(area->pages[i]);
+ ret = split_linear_mapping(l_start, l_start + PAGE_SIZE);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
+ __change_memory_common(l_start,
PAGE_SIZE, set_mask, clear_mask);
}
}
@@ -174,6 +181,9 @@ int set_memory_valid(unsigned long addr, int numpages, int enable)
int set_direct_map_invalid_noflush(struct page *page)
{
+ unsigned long l_start;
+ int ret;
+
struct page_change_data data = {
.set_mask = __pgprot(0),
.clear_mask = __pgprot(PTE_VALID),
@@ -182,13 +192,21 @@ int set_direct_map_invalid_noflush(struct page *page)
if (!can_set_direct_map())
return 0;
+ l_start = (unsigned long)page_address(page);
+ ret = split_linear_mapping(l_start, l_start + PAGE_SIZE);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
return apply_to_page_range(&init_mm,
- (unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ l_start, PAGE_SIZE, change_page_range,
+ &data);
}
int set_direct_map_default_noflush(struct page *page)
{
+ unsigned long l_start;
+ int ret;
+
struct page_change_data data = {
.set_mask = __pgprot(PTE_VALID | PTE_WRITE),
.clear_mask = __pgprot(PTE_RDONLY),
@@ -197,9 +215,14 @@ int set_direct_map_default_noflush(struct page *page)
if (!can_set_direct_map())
return 0;
+ l_start = (unsigned long)page_address(page);
+ ret = split_linear_mapping(l_start, l_start + PAGE_SIZE);
+ if (WARN_ON_ONCE(ret))
+ return ret;
+
return apply_to_page_range(&init_mm,
- (unsigned long)page_address(page),
- PAGE_SIZE, change_page_range, &data);
+ l_start, PAGE_SIZE, change_page_range,
+ &data);
}
static int __set_memory_enc_dec(unsigned long addr,
--
2.47.1
On 04/03/2025 22:19, Yang Shi wrote:
> When rodata=full is specified, kernel linear mapping has to be mapped at
> PTE level since large page table can't be split due to break-before-make
> rule on ARM64.
>
> This resulted in a couple of problems:
> - performance degradation
> - more TLB pressure
> - memory waste for kernel page table
>
> With FEAT_BBM level 2 support, splitting large block page table to
> smaller ones doesn't need to make the page table entry invalid anymore.
> This allows kernel split large block mapping on the fly.
>
> Add kernel page table split support and use large block mapping by
> default when FEAT_BBM level 2 is supported for rodata=full. When
> changing permissions for kernel linear mapping, the page table will be
> split to PTE level.
>
> The machine without FEAT_BBM level 2 will fallback to have kernel linear
> mapping PTE-mapped when rodata=full.
>
> With this we saw significant performance boost with some benchmarks and
> much less memory consumption on my AmpereOne machine (192 cores, 1P) with
> 256GB memory.
>
> * Memory use after boot
> Before:
> MemTotal: 258988984 kB
> MemFree: 254821700 kB
>
> After:
> MemTotal: 259505132 kB
> MemFree: 255410264 kB
>
> Around 500MB more memory are free to use. The larger the machine, the
> more memory saved.
>
> * Memcached
> We saw performance degradation when running Memcached benchmark with
> rodata=full vs rodata=on. Our profiling pointed to kernel TLB pressure.
> With this patchset we saw ops/sec is increased by around 3.5%, P99
> latency is reduced by around 9.6%.
> The gain mainly came from reduced kernel TLB misses. The kernel TLB
> MPKI is reduced by 28.5%.
>
> The benchmark data is now on par with rodata=on too.
>
> * Disk encryption (dm-crypt) benchmark
> Ran fio benchmark with the below command on a 128G ramdisk (ext4) with disk
> encryption (by dm-crypt).
> fio --directory=/data --random_generator=lfsr --norandommap --randrepeat 1 \
> --status-interval=999 --rw=write --bs=4k --loops=1 --ioengine=sync \
> --iodepth=1 --numjobs=1 --fsync_on_close=1 --group_reporting --thread \
> --name=iops-test-job --eta-newline=1 --size 100G
>
> The IOPS is increased by 90% - 150% (the variance is high, but the worst
> number of good case is around 90% more than the best number of bad case).
> The bandwidth is increased and the avg clat is reduced proportionally.
>
> * Sequential file read
> Read 100G file sequentially on XFS (xfs_io read with page cache populated).
> The bandwidth is increased by 150%.
>
> Signed-off-by: Yang Shi <yang@os.amperecomputing.com>
> ---
> arch/arm64/include/asm/cpufeature.h | 10 ++
> arch/arm64/include/asm/mmu.h | 1 +
> arch/arm64/include/asm/pgtable.h | 7 +-
> arch/arm64/kernel/cpufeature.c | 2 +-
> arch/arm64/mm/mmu.c | 169 +++++++++++++++++++++++++++-
> arch/arm64/mm/pageattr.c | 35 +++++-
> 6 files changed, 211 insertions(+), 13 deletions(-)
>
> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
> index 108ef3fbbc00..e24edc32b0bd 100644
> --- a/arch/arm64/include/asm/cpufeature.h
> +++ b/arch/arm64/include/asm/cpufeature.h
> @@ -871,6 +871,16 @@ static inline bool system_supports_bbml2_noabort(void)
> return alternative_has_cap_unlikely(ARM64_HAS_BBML2_NOABORT);
> }
>
> +bool cpu_has_bbml2_noabort(unsigned int cpu_midr);
> +/*
> + * Called at early boot stage on boot CPU before cpu info and cpu feature
> + * are ready.
> + */
> +static inline bool bbml2_noabort_available(void)
> +{
> + return cpu_has_bbml2_noabort(read_cpuid_id());
You'll want to incorporate the IS_ENABLED(CONFIG_ARM64_BBML2_NOABORT) and
arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOBBML2) checks from
Miko's new series to avoid block mappings when BBML2 is disabled. (that second
check will change a bit based on Maz's feedback against Miko's v3).
Hopefully we can factor this out into a common helper that Miko's series can use too?
> +}
> +
> int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
> bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
>
> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
> index 662471cfc536..d658a33df266 100644
> --- a/arch/arm64/include/asm/mmu.h
> +++ b/arch/arm64/include/asm/mmu.h
> @@ -71,6 +71,7 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
> pgprot_t prot, bool page_mappings_only);
> extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
> extern void mark_linear_text_alias_ro(void);
> +extern int split_linear_mapping(unsigned long start, unsigned long end);
>
> /*
> * This check is triggered during the early boot before the cpufeature
> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
> index 0b2a2ad1b9e8..ed2fc1dcf7ae 100644
> --- a/arch/arm64/include/asm/pgtable.h
> +++ b/arch/arm64/include/asm/pgtable.h
> @@ -749,7 +749,7 @@ static inline bool in_swapper_pgdir(void *addr)
> ((unsigned long)swapper_pg_dir & PAGE_MASK);
> }
>
> -static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
> +static inline void __set_pmd_nosync(pmd_t *pmdp, pmd_t pmd)
> {
> #ifdef __PAGETABLE_PMD_FOLDED
> if (in_swapper_pgdir(pmdp)) {
> @@ -759,6 +759,11 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
> #endif /* __PAGETABLE_PMD_FOLDED */
>
> WRITE_ONCE(*pmdp, pmd);
> +}
> +
> +static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
> +{
> + __set_pmd_nosync(pmdp, pmd);
>
> if (pmd_valid(pmd)) {
> dsb(ishst);
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index bf3df8407ca3..d39637d5aeab 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -2176,7 +2176,7 @@ static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
> return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE);
> }
>
> -static bool cpu_has_bbml2_noabort(unsigned int cpu_midr)
> +bool cpu_has_bbml2_noabort(unsigned int cpu_midr)
> {
> /* We want to allow usage of bbml2 in as wide a range of kernel contexts
> * as possible. This list is therefore an allow-list of known-good
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index dccf0877285b..ad0f1cc55e3a 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -45,6 +45,7 @@
> #define NO_BLOCK_MAPPINGS BIT(0)
> #define NO_CONT_MAPPINGS BIT(1)
> #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
> +#define SPLIT_MAPPINGS BIT(3)
>
> u64 kimage_voffset __ro_after_init;
> EXPORT_SYMBOL(kimage_voffset);
> @@ -166,6 +167,73 @@ static void init_clear_pgtable(void *table)
> dsb(ishst);
> }
>
> +static int split_pmd(pmd_t *pmdp, pmd_t pmdval,
> + phys_addr_t (*pgtable_alloc)(int))
> +{
> + unsigned long pfn;
> + pgprot_t prot;
> + phys_addr_t pte_phys;
> + pte_t *ptep;
> +
> + if (!pmd_leaf(pmdval))
> + return 0;
> +
> + pfn = pmd_pfn(pmdval);
> + prot = pmd_pgprot(pmdval);
> +
> + pte_phys = pgtable_alloc(PAGE_SHIFT);
> + if (!pte_phys)
> + return -ENOMEM;
> +
> + ptep = (pte_t *)phys_to_virt(pte_phys);
> + init_clear_pgtable(ptep);
No need for this, you're about to fill the table with ptes so clearing it is a
waste of time.
> + prot = __pgprot(pgprot_val(prot) | PTE_TYPE_PAGE);
This happens to work for D64 pgtables because of the way the bits are arranged.
But it won't work for D128 (when we get there). We are in the process of
cleaning up the code base to make it D128 ready. So let's fix this now:
prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) | PTE_TYPE_PAGE);
nit: I'd move this up, next to the "prot = pmd_pgprot(pmdval);" line.
> + for (int i = 0; i < PTRS_PER_PTE; i++, ptep++)
> + __set_pte_nosync(ptep, pfn_pte(pfn + i, prot));
nit: you're incrementing ptep but adding i to pfn. Why not just increment pfn too?
> +
> + dsb(ishst);
> +
> + set_pmd(pmdp, pfn_pmd(__phys_to_pfn(pte_phys),
> + __pgprot(PMD_TYPE_TABLE)));
You're missing some required pgprot flags and it would be better to follow what
alloc_init_cont_pte() does in general. Something like:
pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
if (flags & NO_EXEC_MAPPINGS)
pmdval |= PMD_TABLE_PXN;
__pmd_populate(pmdp, pte_phys, pmdval);
> +
> + return 0;
> +}
> +
> +static int split_pud(pud_t *pudp, pud_t pudval,
> + phys_addr_t (*pgtable_alloc)(int))
All the same comments for split_pmd() apply here too.
> +{
> + unsigned long pfn;
> + pgprot_t prot;
> + pmd_t *pmdp;
> + phys_addr_t pmd_phys;
> + unsigned int step;
> +
> + if (!pud_leaf(pudval))
> + return 0;
> +
> + pfn = pud_pfn(pudval);
> + prot = pud_pgprot(pudval);
> + step = PMD_SIZE >> PAGE_SHIFT;
> +
> + pmd_phys = pgtable_alloc(PMD_SHIFT);
> + if (!pmd_phys)
> + return -ENOMEM;
> +
> + pmdp = (pmd_t *)phys_to_virt(pmd_phys);
> + init_clear_pgtable(pmdp);
> + for (int i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
> + __set_pmd_nosync(pmdp, pfn_pmd(pfn, prot));
> + pfn += step;
> + }
> +
> + dsb(ishst);
> +
> + set_pud(pudp, pfn_pud(__phys_to_pfn(pmd_phys),
> + __pgprot(PUD_TYPE_TABLE)));
> +
> + return 0;
> +}
> +
> static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
> phys_addr_t phys, pgprot_t prot)
> {
> @@ -251,12 +319,21 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
> {
> unsigned long next;
> int ret = 0;
> + bool split = flags & SPLIT_MAPPINGS;
>
> do {
> pmd_t old_pmd = READ_ONCE(*pmdp);
>
> next = pmd_addr_end(addr, end);
>
> + if (split) {
I think this should be:
if (flags & SPLIT_MAPPINGS &&
pmd_leaf(old_pmd) &&
next < addr + PMD_SIZE) {
So we only attempt a split if it's a leaf and the leaf is not fully contained by
the range. Your current code always splits even if the block mapping is fully
contained, which seems wasteful. And if the pmd is not a leaf (either not
present or a table), split_pmd() will currently do nothing and return 0, so
there is no opportunity to install mappings or visit the ptes.
> + ret = split_pmd(pmdp, old_pmd, pgtable_alloc);
But... do we need the special split_pmd() and split_pud() functions at all?
Can't we just allocate a new table here, then let the existing code populate it,
then replace the block mapping with the table mapping? Same goes for huge puds.
If you take this approach, I think a lot of the code below will significantly
simplify.
> + if (ret)
> + break;
> +
> + continue;
> + }
> +
> /* try section mapping first */
> if (((addr | next | phys) & ~PMD_MASK) == 0 &&
> (flags & NO_BLOCK_MAPPINGS) == 0) {
You'll want to modify this last bit to avoid setting up a block mapping if we
are trying to split?
(flags & (NO_BLOCK_MAPPINGS | SPLIT_MAPPINGS)) == 0) {
Or perhaps it's an error to call this without NO_BLOCK_MAPPINGS if
SPLIT_MAPPINGS is specified? Or perhaps we don't even need SPLIT_MAPPINGS, and
NO_BLOCK_MAPPINGS means we will split if we find any block mappings? (similarly
for NO_CONT_MAPPINGS)?
> @@ -292,11 +369,19 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
> int ret = 0;
> pud_t pud = READ_ONCE(*pudp);
> pmd_t *pmdp;
> + bool split = flags & SPLIT_MAPPINGS;
>
> /*
> * Check for initial section mappings in the pgd/pud.
> */
> BUG_ON(pud_sect(pud));
> +
> + if (split) {
> + BUG_ON(pud_none(pud));
> + pmdp = pmd_offset(pudp, addr);
> + goto split_pgtable;
> + }
> +
> if (pud_none(pud)) {
> pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
> phys_addr_t pmd_phys;
> @@ -316,6 +401,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
> pmdp = pmd_set_fixmap_offset(pudp, addr);
> }
>
> +split_pgtable:
> do {
> pgprot_t __prot = prot;
>
> @@ -334,7 +420,8 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
> phys += next - addr;
> } while (addr = next, addr != end);
>
> - pmd_clear_fixmap();
> + if (!split)
> + pmd_clear_fixmap();
>
> return ret;
> }
> @@ -348,6 +435,13 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
> int ret = 0;
> p4d_t p4d = READ_ONCE(*p4dp);
> pud_t *pudp;
> + bool split = flags & SPLIT_MAPPINGS;
> +
> + if (split) {
> + BUG_ON(p4d_none(p4d));
> + pudp = pud_offset(p4dp, addr);
> + goto split_pgtable;
> + }
>
> if (p4d_none(p4d)) {
> p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF;
> @@ -368,11 +462,25 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
> pudp = pud_set_fixmap_offset(p4dp, addr);
> }
>
> +split_pgtable:
> do {
> pud_t old_pud = READ_ONCE(*pudp);
>
> next = pud_addr_end(addr, end);
>
> + if (split) {
> + ret = split_pud(pudp, old_pud, pgtable_alloc);
> + if (ret)
> + break;
> +
> + ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
> + pgtable_alloc, flags);
> + if (ret)
> + break;
> +
> + continue;
> + }
> +
> /*
> * For 4K granule only, attempt to put down a 1GB block
> */
> @@ -399,7 +507,8 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
> phys += next - addr;
> } while (pudp++, addr = next, addr != end);
>
> - pud_clear_fixmap();
> + if (!split)
> + pud_clear_fixmap();
>
> return ret;
> }
> @@ -413,6 +522,13 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
> int ret = 0;
> pgd_t pgd = READ_ONCE(*pgdp);
> p4d_t *p4dp;
> + bool split = flags & SPLIT_MAPPINGS;
> +
> + if (split) {
> + BUG_ON(pgd_none(pgd));
> + p4dp = p4d_offset(pgdp, addr);
> + goto split_pgtable;
> + }
>
> if (pgd_none(pgd)) {
> pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF;
> @@ -433,6 +549,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
> p4dp = p4d_set_fixmap_offset(pgdp, addr);
> }
>
> +split_pgtable:
> do {
> p4d_t old_p4d = READ_ONCE(*p4dp);
>
> @@ -449,7 +566,8 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
> phys += next - addr;
> } while (p4dp++, addr = next, addr != end);
>
> - p4d_clear_fixmap();
> + if (!split)
> + p4d_clear_fixmap();
>
> return ret;
> }
> @@ -546,6 +664,23 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
> return pa;
> }
>
> +int split_linear_mapping(unsigned long start, unsigned long end)
> +{
> + int ret = 0;
> +
> + if (!system_supports_bbml2_noabort())
> + return 0;
> +
> + mmap_write_lock(&init_mm);
> + ret = __create_pgd_mapping_locked(init_mm.pgd, virt_to_phys((void *)start),
> + start, (end - start), __pgprot(0),
> + __pgd_pgtable_alloc, SPLIT_MAPPINGS);
> + mmap_write_unlock(&init_mm);
> + flush_tlb_kernel_range(start, end);
> +
> + return ret;
> +}
> +
> /*
> * This function can only be used to modify existing table entries,
> * without allocating new levels of table. Note that this permits the
> @@ -665,6 +800,24 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
>
> #endif /* CONFIG_KFENCE */
>
> +static inline bool force_pte_mapping(void)
> +{
> + /*
> + * Can't use cpufeature API to determine whether BBML2 supported
> + * or not since cpufeature have not been finalized yet.
> + *
> + * Checking the boot CPU only for now. If the boot CPU has
> + * BBML2, paint linear mapping with block mapping. If it turns
> + * out the secondary CPUs don't support BBML2 once cpufeature is
> + * fininalized, the linear mapping will be repainted with PTE
> + * mapping.
> + */
> + return (rodata_full && !bbml2_noabort_available()) ||
> + debug_pagealloc_enabled() ||
> + arm64_kfence_can_set_direct_map() ||
> + is_realm_world();
> +}
> +
> static void __init map_mem(pgd_t *pgdp)
> {
> static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
> @@ -690,9 +843,12 @@ static void __init map_mem(pgd_t *pgdp)
>
> early_kfence_pool = arm64_kfence_alloc_pool();
>
> - if (can_set_direct_map())
> + if (force_pte_mapping())
> flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>
> + if (rodata_full)
> + flags |= NO_CONT_MAPPINGS;
> +
> /*
> * Take care not to create a writable alias for the
> * read-only text and rodata sections of the kernel image.
> @@ -1388,9 +1544,12 @@ int arch_add_memory(int nid, u64 start, u64 size,
>
> VM_BUG_ON(!mhp_range_allowed(start, size, true));
>
> - if (can_set_direct_map())
> + if (force_pte_mapping())
> flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>
> + if (rodata_full)
> + flags |= NO_CONT_MAPPINGS;
> +
> __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
> size, params->pgprot, __pgd_pgtable_alloc,
> flags);
> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
> index 39fd1f7ff02a..5d42d87ea7e1 100644
> --- a/arch/arm64/mm/pageattr.c
> +++ b/arch/arm64/mm/pageattr.c
> @@ -10,6 +10,7 @@
> #include <linux/vmalloc.h>
>
> #include <asm/cacheflush.h>
> +#include <asm/mmu.h>
> #include <asm/pgtable-prot.h>
> #include <asm/set_memory.h>
> #include <asm/tlbflush.h>
> @@ -80,8 +81,9 @@ static int change_memory_common(unsigned long addr, int numpages,
> unsigned long start = addr;
> unsigned long size = PAGE_SIZE * numpages;
> unsigned long end = start + size;
> + unsigned long l_start;
> struct vm_struct *area;
> - int i;
> + int i, ret;
>
> if (!PAGE_ALIGNED(addr)) {
> start &= PAGE_MASK;
> @@ -118,7 +120,12 @@ static int change_memory_common(unsigned long addr, int numpages,
> if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
> pgprot_val(clear_mask) == PTE_RDONLY)) {
> for (i = 0; i < area->nr_pages; i++) {
> - __change_memory_common((u64)page_address(area->pages[i]),
> + l_start = (u64)page_address(area->pages[i]);
> + ret = split_linear_mapping(l_start, l_start + PAGE_SIZE);
This isn't quite aligned with how I was thinking about it. You still have 2
passes here; one to split the range to base pages, then another to modify the
permissions.
I was thinking we could use the table walker in mmu.c to achieve 2 benefits:
- Do both operations in a single pass (a bit like how calling
update_mapping_prot() will update the protections on an existing mapping, and
the table walker will split when it comes across a huge page)
- Only split when needed; if the whole huge page is contained within the
range, then there is no need to split in the first place.
We could then split vmalloc regions for free using this infrastructure too.
Although there is a wrinkle that the mmu.c table walker only accepts a pgprot
and can't currently handle a set_mask/clear_mask. I guess that could be added,
but it starts to get a bit busy. I think this generic infra would be useful
though. What do you think?
[...]
Thanks,
Ryan
On 3/14/25 6:29 AM, Ryan Roberts wrote:
> On 04/03/2025 22:19, Yang Shi wrote:
>> When rodata=full is specified, kernel linear mapping has to be mapped at
>> PTE level since large page table can't be split due to break-before-make
>> rule on ARM64.
>>
>> This resulted in a couple of problems:
>> - performance degradation
>> - more TLB pressure
>> - memory waste for kernel page table
>>
>> With FEAT_BBM level 2 support, splitting large block page table to
>> smaller ones doesn't need to make the page table entry invalid anymore.
>> This allows kernel split large block mapping on the fly.
>>
>> Add kernel page table split support and use large block mapping by
>> default when FEAT_BBM level 2 is supported for rodata=full. When
>> changing permissions for kernel linear mapping, the page table will be
>> split to PTE level.
>>
>> The machine without FEAT_BBM level 2 will fallback to have kernel linear
>> mapping PTE-mapped when rodata=full.
>>
>> With this we saw significant performance boost with some benchmarks and
>> much less memory consumption on my AmpereOne machine (192 cores, 1P) with
>> 256GB memory.
>>
>> * Memory use after boot
>> Before:
>> MemTotal: 258988984 kB
>> MemFree: 254821700 kB
>>
>> After:
>> MemTotal: 259505132 kB
>> MemFree: 255410264 kB
>>
>> Around 500MB more memory are free to use. The larger the machine, the
>> more memory saved.
>>
>> * Memcached
>> We saw performance degradation when running Memcached benchmark with
>> rodata=full vs rodata=on. Our profiling pointed to kernel TLB pressure.
>> With this patchset we saw ops/sec is increased by around 3.5%, P99
>> latency is reduced by around 9.6%.
>> The gain mainly came from reduced kernel TLB misses. The kernel TLB
>> MPKI is reduced by 28.5%.
>>
>> The benchmark data is now on par with rodata=on too.
>>
>> * Disk encryption (dm-crypt) benchmark
>> Ran fio benchmark with the below command on a 128G ramdisk (ext4) with disk
>> encryption (by dm-crypt).
>> fio --directory=/data --random_generator=lfsr --norandommap --randrepeat 1 \
>> --status-interval=999 --rw=write --bs=4k --loops=1 --ioengine=sync \
>> --iodepth=1 --numjobs=1 --fsync_on_close=1 --group_reporting --thread \
>> --name=iops-test-job --eta-newline=1 --size 100G
>>
>> The IOPS is increased by 90% - 150% (the variance is high, but the worst
>> number of good case is around 90% more than the best number of bad case).
>> The bandwidth is increased and the avg clat is reduced proportionally.
>>
>> * Sequential file read
>> Read 100G file sequentially on XFS (xfs_io read with page cache populated).
>> The bandwidth is increased by 150%.
>>
>> Signed-off-by: Yang Shi<yang@os.amperecomputing.com>
>> ---
>> arch/arm64/include/asm/cpufeature.h | 10 ++
>> arch/arm64/include/asm/mmu.h | 1 +
>> arch/arm64/include/asm/pgtable.h | 7 +-
>> arch/arm64/kernel/cpufeature.c | 2 +-
>> arch/arm64/mm/mmu.c | 169 +++++++++++++++++++++++++++-
>> arch/arm64/mm/pageattr.c | 35 +++++-
>> 6 files changed, 211 insertions(+), 13 deletions(-)
>>
>> diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
>> index 108ef3fbbc00..e24edc32b0bd 100644
>> --- a/arch/arm64/include/asm/cpufeature.h
>> +++ b/arch/arm64/include/asm/cpufeature.h
>> @@ -871,6 +871,16 @@ static inline bool system_supports_bbml2_noabort(void)
>> return alternative_has_cap_unlikely(ARM64_HAS_BBML2_NOABORT);
>> }
>>
>> +bool cpu_has_bbml2_noabort(unsigned int cpu_midr);
>> +/*
>> + * Called at early boot stage on boot CPU before cpu info and cpu feature
>> + * are ready.
>> + */
>> +static inline bool bbml2_noabort_available(void)
>> +{
>> + return cpu_has_bbml2_noabort(read_cpuid_id());
> You'll want to incorporate the IS_ENABLED(CONFIG_ARM64_BBML2_NOABORT) and
> arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOBBML2) checks from
> Miko's new series to avoid block mappings when BBML2 is disabled. (that second
> check will change a bit based on Maz's feedback against Miko's v3).
Sure
> Hopefully we can factor out into a common helper that is used by Miko's stuff too?
I think checking the kernel config and
arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_NOBBML2) can be
consolidated into a helper?
>> +}
>> +
>> int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt);
>> bool try_emulate_mrs(struct pt_regs *regs, u32 isn);
>>
>> diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
>> index 662471cfc536..d658a33df266 100644
>> --- a/arch/arm64/include/asm/mmu.h
>> +++ b/arch/arm64/include/asm/mmu.h
>> @@ -71,6 +71,7 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
>> pgprot_t prot, bool page_mappings_only);
>> extern void *fixmap_remap_fdt(phys_addr_t dt_phys, int *size, pgprot_t prot);
>> extern void mark_linear_text_alias_ro(void);
>> +extern int split_linear_mapping(unsigned long start, unsigned long end);
>>
>> /*
>> * This check is triggered during the early boot before the cpufeature
>> diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
>> index 0b2a2ad1b9e8..ed2fc1dcf7ae 100644
>> --- a/arch/arm64/include/asm/pgtable.h
>> +++ b/arch/arm64/include/asm/pgtable.h
>> @@ -749,7 +749,7 @@ static inline bool in_swapper_pgdir(void *addr)
>> ((unsigned long)swapper_pg_dir & PAGE_MASK);
>> }
>>
>> -static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>> +static inline void __set_pmd_nosync(pmd_t *pmdp, pmd_t pmd)
>> {
>> #ifdef __PAGETABLE_PMD_FOLDED
>> if (in_swapper_pgdir(pmdp)) {
>> @@ -759,6 +759,11 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>> #endif /* __PAGETABLE_PMD_FOLDED */
>>
>> WRITE_ONCE(*pmdp, pmd);
>> +}
>> +
>> +static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
>> +{
>> + __set_pmd_nosync(pmdp, pmd);
>>
>> if (pmd_valid(pmd)) {
>> dsb(ishst);
>> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
>> index bf3df8407ca3..d39637d5aeab 100644
>> --- a/arch/arm64/kernel/cpufeature.c
>> +++ b/arch/arm64/kernel/cpufeature.c
>> @@ -2176,7 +2176,7 @@ static bool hvhe_possible(const struct arm64_cpu_capabilities *entry,
>> return arm64_test_sw_feature_override(ARM64_SW_FEATURE_OVERRIDE_HVHE);
>> }
>>
>> -static bool cpu_has_bbml2_noabort(unsigned int cpu_midr)
>> +bool cpu_has_bbml2_noabort(unsigned int cpu_midr)
>> {
>> /* We want to allow usage of bbml2 in as wide a range of kernel contexts
>> * as possible. This list is therefore an allow-list of known-good
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index dccf0877285b..ad0f1cc55e3a 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -45,6 +45,7 @@
>> #define NO_BLOCK_MAPPINGS BIT(0)
>> #define NO_CONT_MAPPINGS BIT(1)
>> #define NO_EXEC_MAPPINGS BIT(2) /* assumes FEAT_HPDS is not used */
>> +#define SPLIT_MAPPINGS BIT(3)
>>
>> u64 kimage_voffset __ro_after_init;
>> EXPORT_SYMBOL(kimage_voffset);
>> @@ -166,6 +167,73 @@ static void init_clear_pgtable(void *table)
>> dsb(ishst);
>> }
>>
>> +static int split_pmd(pmd_t *pmdp, pmd_t pmdval,
>> + phys_addr_t (*pgtable_alloc)(int))
>> +{
>> + unsigned long pfn;
>> + pgprot_t prot;
>> + phys_addr_t pte_phys;
>> + pte_t *ptep;
>> +
>> + if (!pmd_leaf(pmdval))
>> + return 0;
>> +
>> + pfn = pmd_pfn(pmdval);
>> + prot = pmd_pgprot(pmdval);
>> +
>> + pte_phys = pgtable_alloc(PAGE_SHIFT);
>> + if (!pte_phys)
>> + return -ENOMEM;
>> +
>> + ptep = (pte_t *)phys_to_virt(pte_phys);
>> + init_clear_pgtable(ptep);
> No need for this, you're about to fill the table with ptes so clearing it is a
> waste of time.
OK
>> + prot = __pgprot(pgprot_val(prot) | PTE_TYPE_PAGE);
> This happen to work for D64 pgtables because of the way the bits are arranged.
> But it won't work for D128 (when we get there). We are in the process of
> cleaning up the code base to make it D128 ready. So let's fix this now:
>
> prot = __pgprot(pgprot_val(prot) & ~PMD_TYPE_MASK) | PTE_TYPE_PAGE);
>
> nit: I'd move this up, next to the "prot = pmd_pgprot(pmdval);" line.
OK
>> + for (int i = 0; i < PTRS_PER_PTE; i++, ptep++)
>> + __set_pte_nosync(ptep, pfn_pte(pfn + i, prot));
> nit: you're incrementing ptep but adding i to pfn. Why not just increment pfn too?
Sure, pfn++ works too.
>> +
>> + dsb(ishst);
>> +
>> + set_pmd(pmdp, pfn_pmd(__phys_to_pfn(pte_phys),
>> + __pgprot(PMD_TYPE_TABLE)));
> You're missing some required pgprot flags and it would be better to follow what
> alloc_init_cont_pte() does in general. Something like:
>
> pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
> if (flags & NO_EXEC_MAPPINGS)
> pmdval |= PMD_TABLE_PXN;
> __pmd_populate(pmdp, pte_phys, pmdval);
Sure
>> +
>> + return 0;
>> +}
>> +
>> +static int split_pud(pud_t *pudp, pud_t pudval,
>> + phys_addr_t (*pgtable_alloc)(int))
> All the same comments for split_pmd() apply here too.
>
>> +{
>> + unsigned long pfn;
>> + pgprot_t prot;
>> + pmd_t *pmdp;
>> + phys_addr_t pmd_phys;
>> + unsigned int step;
>> +
>> + if (!pud_leaf(pudval))
>> + return 0;
>> +
>> + pfn = pud_pfn(pudval);
>> + prot = pud_pgprot(pudval);
>> + step = PMD_SIZE >> PAGE_SHIFT;
>> +
>> + pmd_phys = pgtable_alloc(PMD_SHIFT);
>> + if (!pmd_phys)
>> + return -ENOMEM;
>> +
>> + pmdp = (pmd_t *)phys_to_virt(pmd_phys);
>> + init_clear_pgtable(pmdp);
>> + for (int i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
>> + __set_pmd_nosync(pmdp, pfn_pmd(pfn, prot));
>> + pfn += step;
>> + }
>> +
>> + dsb(ishst);
>> +
>> + set_pud(pudp, pfn_pud(__phys_to_pfn(pmd_phys),
>> + __pgprot(PUD_TYPE_TABLE)));
>> +
>> + return 0;
>> +}
>> +
>> static void init_pte(pte_t *ptep, unsigned long addr, unsigned long end,
>> phys_addr_t phys, pgprot_t prot)
>> {
>> @@ -251,12 +319,21 @@ static int init_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end,
>> {
>> unsigned long next;
>> int ret = 0;
>> + bool split = flags & SPLIT_MAPPINGS;
>>
>> do {
>> pmd_t old_pmd = READ_ONCE(*pmdp);
>>
>> next = pmd_addr_end(addr, end);
>>
>> + if (split) {
> I think this should be:
>
> if (flags & SPLIT_MAPPINGS &&
> pmd_leaf(old_pmd) &&
> next < addr + PMD_SIZE) {
>
> So we only attempt a split if its a leaf and the leaf is not fully contained by
> the range. Your current code is always splitting even if the block mapping is
> fully contained which seems a waste. And if the pmd is not a leaf (either not
> present or a table) split_pmd will currently do nothing and return 0, so there
> is no opportunity to install mappings or visit the ptes.
Yes, it splits the PMD even though the block mapping is fully contained. That
is because the current user (change_memory_common()) just manipulates page
permissions at PAGE_SIZE granularity IIRC. But I agree with you that not
splitting when the block is fully contained is better and more flexible. We
wouldn't have to change this code if change_memory_common() is later enhanced
to handle contiguous pages. However, that code path would be untested since
there is no use case at the moment.
If the PMD is not a leaf it will do nothing because this patch doesn't handle
CONT_PTE, and if the PMD is a table it already points to a PTE table so we
don't need to do anything. A later patch handles CONT_PTE.
>> + ret = split_pmd(pmdp, old_pmd, pgtable_alloc);
> But... do we need the special split_pmd() and split_pud() functions at all?
> Can't we just allocate a new table here, then let the existing code populate it,
> then replace the block mapping with the table mapping? Same goes for huge puds.
> If you take this approach, I think a lot of the code below will significantly
> simplify.
Actually I thought about this. The existing code populates the page table for
the range size@addr; if the size is, for example, PUD size, the existing code
can populate the page table as you suggested. But as I mentioned above,
change_memory_common() is called at PAGE_SIZE granularity. If we just allocate
a page table and then let the existing code populate it, we will end up
populating just one PMD and PTE entry for the specified address. For example,
when a module is loaded, its text segment may use just one page, so the kernel
only needs to change the permission for that page.
So we would still need to populate the other PMD and PTE entries besides the
specified address. That is where most of the code in split_pud() and
split_pmd() goes.
To make your suggestion work I think we could set the addr and end used by the
walker to the start and end boundaries of the PUD (P4D doesn't support block
mapping yet) respectively. For example:
@@ -441,8 +441,14 @@ static void __create_pgd_mapping_locked(pgd_t
*pgdir, phys_addr_t phys,
return;
phys &= PAGE_MASK;
- addr = virt & PAGE_MASK;
- end = PAGE_ALIGN(virt + size);
+ if (split) {
+ addr = start_pud_boundary;
+ end = end_pud_boundary;
+ phys = __pa(start_pud_boundary);
+ } else {
+ addr = virt & PAGE_MASK;
+ end = PAGE_ALIGN(virt + size);
+ }
do {
next = pgd_addr_end(addr, end);
But we may need to add a dedicated parameter for the start boundary of the
page table if we want to do the split and the permission change in one pass as
you suggested below, since we need to know which PTE permissions need to be
changed. However, this may make detecting a fully contained range harder: the
range passed in by the caller needs to be preserved so that we know which PUD
or PMD permissions need to be changed. CONT mappings will make it more
complicated.
So it sounds like we need quite a few more parameters. We may need to put all
of them into a struct, for example, something like the below off the top of my
head:
struct walk_param {
unsigned long start;
unsigned long end;
unsigned long addr;
unsigned long orig_start;
unsigned long orig_end;
pgprot_t clear_prot;
pgprot_t set_prot;
pgprot_t prot;
};
So I'm not sure whether the code can be significantly simplified or not.
>> + if (ret)
>> + break;
>> +
>> + continue;
>> + }
>> +
>> /* try section mapping first */
>> if (((addr | next | phys) & ~PMD_MASK) == 0 &&
>> (flags & NO_BLOCK_MAPPINGS) == 0) {
> You'll want to modify this last bit to avoid setting up a block mapping if we
> are trying to split?
>
> (flags & (NO_BLOCK_MAPPINGS | SPLIT_MAPPINGS) == 0) {
The specified address can't have a block mapping, but the surrounding
addresses can. For example, when splitting a PUD, the PMD containing the
specified address will be a table, but all of the other 511 PMDs can still be
block mappings.
> Or perhaps it's an error to call this without NO_BLOCK_MAPPINGS if
> SPLIT_MAPPINGS is specified? Or perhaps we don't even need SPLIT_MAPPINGS, and
> NO_BLOCK_MAPPINGS means we will split if we find any block mappings? (similarly
> for NO_CONT_MAPPINGS)?
As I said above we can still have block mappings, so using NO_BLOCK_MAPPINGS
may cause some confusion?
>> @@ -292,11 +369,19 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> int ret = 0;
>> pud_t pud = READ_ONCE(*pudp);
>> pmd_t *pmdp;
>> + bool split = flags & SPLIT_MAPPINGS;
>>
>> /*
>> * Check for initial section mappings in the pgd/pud.
>> */
>> BUG_ON(pud_sect(pud));
>> +
>> + if (split) {
>> + BUG_ON(pud_none(pud));
>> + pmdp = pmd_offset(pudp, addr);
>> + goto split_pgtable;
>> + }
>> +
>> if (pud_none(pud)) {
>> pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
>> phys_addr_t pmd_phys;
>> @@ -316,6 +401,7 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> pmdp = pmd_set_fixmap_offset(pudp, addr);
>> }
>>
>> +split_pgtable:
>> do {
>> pgprot_t __prot = prot;
>>
>> @@ -334,7 +420,8 @@ static int alloc_init_cont_pmd(pud_t *pudp, unsigned long addr,
>> phys += next - addr;
>> } while (addr = next, addr != end);
>>
>> - pmd_clear_fixmap();
>> + if (!split)
>> + pmd_clear_fixmap();
>>
>> return ret;
>> }
>> @@ -348,6 +435,13 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> int ret = 0;
>> p4d_t p4d = READ_ONCE(*p4dp);
>> pud_t *pudp;
>> + bool split = flags & SPLIT_MAPPINGS;
>> +
>> + if (split) {
>> + BUG_ON(p4d_none(p4d));
>> + pudp = pud_offset(p4dp, addr);
>> + goto split_pgtable;
>> + }
>>
>> if (p4d_none(p4d)) {
>> p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF;
>> @@ -368,11 +462,25 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> pudp = pud_set_fixmap_offset(p4dp, addr);
>> }
>>
>> +split_pgtable:
>> do {
>> pud_t old_pud = READ_ONCE(*pudp);
>>
>> next = pud_addr_end(addr, end);
>>
>> + if (split) {
>> + ret = split_pud(pudp, old_pud, pgtable_alloc);
>> + if (ret)
>> + break;
>> +
>> + ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
>> + pgtable_alloc, flags);
>> + if (ret)
>> + break;
>> +
>> + continue;
>> + }
>> +
>> /*
>> * For 4K granule only, attempt to put down a 1GB block
>> */
>> @@ -399,7 +507,8 @@ static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
>> phys += next - addr;
>> } while (pudp++, addr = next, addr != end);
>>
>> - pud_clear_fixmap();
>> + if (!split)
>> + pud_clear_fixmap();
>>
>> return ret;
>> }
>> @@ -413,6 +522,13 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> int ret = 0;
>> pgd_t pgd = READ_ONCE(*pgdp);
>> p4d_t *p4dp;
>> + bool split = flags & SPLIT_MAPPINGS;
>> +
>> + if (split) {
>> + BUG_ON(pgd_none(pgd));
>> + p4dp = p4d_offset(pgdp, addr);
>> + goto split_pgtable;
>> + }
>>
>> if (pgd_none(pgd)) {
>> pgdval_t pgdval = PGD_TYPE_TABLE | PGD_TABLE_UXN | PGD_TABLE_AF;
>> @@ -433,6 +549,7 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> p4dp = p4d_set_fixmap_offset(pgdp, addr);
>> }
>>
>> +split_pgtable:
>> do {
>> p4d_t old_p4d = READ_ONCE(*p4dp);
>>
>> @@ -449,7 +566,8 @@ static int alloc_init_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end,
>> phys += next - addr;
>> } while (p4dp++, addr = next, addr != end);
>>
>> - p4d_clear_fixmap();
>> + if (!split)
>> + p4d_clear_fixmap();
>>
>> return ret;
>> }
>> @@ -546,6 +664,23 @@ static phys_addr_t pgd_pgtable_alloc(int shift)
>> return pa;
>> }
>>
>> +int split_linear_mapping(unsigned long start, unsigned long end)
>> +{
>> + int ret = 0;
>> +
>> + if (!system_supports_bbml2_noabort())
>> + return 0;
>> +
>> + mmap_write_lock(&init_mm);
>> + ret = __create_pgd_mapping_locked(init_mm.pgd, virt_to_phys((void *)start),
>> + start, (end - start), __pgprot(0),
>> + __pgd_pgtable_alloc, SPLIT_MAPPINGS);
>> + mmap_write_unlock(&init_mm);
>> + flush_tlb_kernel_range(start, end);
>> +
>> + return ret;
>> +}
>> +
>> /*
>> * This function can only be used to modify existing table entries,
>> * without allocating new levels of table. Note that this permits the
>> @@ -665,6 +800,24 @@ static inline void arm64_kfence_map_pool(phys_addr_t kfence_pool, pgd_t *pgdp) {
>>
>> #endif /* CONFIG_KFENCE */
>>
>> +static inline bool force_pte_mapping(void)
>> +{
>> + /*
>> + * Can't use cpufeature API to determine whether BBML2 supported
>> + * or not since cpufeature have not been finalized yet.
>> + *
>> + * Checking the boot CPU only for now. If the boot CPU has
>> + * BBML2, paint linear mapping with block mapping. If it turns
>> + * out the secondary CPUs don't support BBML2 once cpufeature is
>> + * fininalized, the linear mapping will be repainted with PTE
>> + * mapping.
>> + */
>> + return (rodata_full && !bbml2_noabort_available()) ||
>> + debug_pagealloc_enabled() ||
>> + arm64_kfence_can_set_direct_map() ||
>> + is_realm_world();
>> +}
>> +
>> static void __init map_mem(pgd_t *pgdp)
>> {
>> static const u64 direct_map_end = _PAGE_END(VA_BITS_MIN);
>> @@ -690,9 +843,12 @@ static void __init map_mem(pgd_t *pgdp)
>>
>> early_kfence_pool = arm64_kfence_alloc_pool();
>>
>> - if (can_set_direct_map())
>> + if (force_pte_mapping())
>> flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>>
>> + if (rodata_full)
>> + flags |= NO_CONT_MAPPINGS;
>> +
>> /*
>> * Take care not to create a writable alias for the
>> * read-only text and rodata sections of the kernel image.
>> @@ -1388,9 +1544,12 @@ int arch_add_memory(int nid, u64 start, u64 size,
>>
>> VM_BUG_ON(!mhp_range_allowed(start, size, true));
>>
>> - if (can_set_direct_map())
>> + if (force_pte_mapping())
>> flags |= NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>>
>> + if (rodata_full)
>> + flags |= NO_CONT_MAPPINGS;
>> +
>> __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>> size, params->pgprot, __pgd_pgtable_alloc,
>> flags);
>> diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c
>> index 39fd1f7ff02a..5d42d87ea7e1 100644
>> --- a/arch/arm64/mm/pageattr.c
>> +++ b/arch/arm64/mm/pageattr.c
>> @@ -10,6 +10,7 @@
>> #include <linux/vmalloc.h>
>>
>> #include <asm/cacheflush.h>
>> +#include <asm/mmu.h>
>> #include <asm/pgtable-prot.h>
>> #include <asm/set_memory.h>
>> #include <asm/tlbflush.h>
>> @@ -80,8 +81,9 @@ static int change_memory_common(unsigned long addr, int numpages,
>> unsigned long start = addr;
>> unsigned long size = PAGE_SIZE * numpages;
>> unsigned long end = start + size;
>> + unsigned long l_start;
>> struct vm_struct *area;
>> - int i;
>> + int i, ret;
>>
>> if (!PAGE_ALIGNED(addr)) {
>> start &= PAGE_MASK;
>> @@ -118,7 +120,12 @@ static int change_memory_common(unsigned long addr, int numpages,
>> if (rodata_full && (pgprot_val(set_mask) == PTE_RDONLY ||
>> pgprot_val(clear_mask) == PTE_RDONLY)) {
>> for (i = 0; i < area->nr_pages; i++) {
>> - __change_memory_common((u64)page_address(area->pages[i]),
>> + l_start = (u64)page_address(area->pages[i]);
>> + ret = split_linear_mapping(l_start, l_start + PAGE_SIZE);
> This isn't quite aligned with how I was thinking about it. You still have 2
> passes here; one to split the range to base pages, then another to modify the
> permissions.
>
> I was thinking we could use the table walker in mmu.c to achieve 2 benefits:
>
> - Do both operations in a single pass (a bit like how calling
> update_mapping_prot() will update the protections on an existing mapping, and
> the table walker will split when it comes across a huge page)
>
> - Only split when needed; if the whole huge page is contained within the
> range, then there is no need to split in the first place.
>
> We could then split vmalloc regions for free using this infrastructure too.
>
> Although there is a wrinkle that the mmu.c table walker only accepts a pgprot
> and can't currently handle a set_mask/clear_mask. I guess that could be added,
> but it starts to get a bit busy. I think this generic infra would be useful
> though. What do you think?
Yes, we need to add another pgprot parameter (maybe more) to tell the walker
what is going to be set and what is going to be cleared. I agree the generic
infra would be useful. Would you prefer to implement it in this patchset or in
a separate follow-up patchset?
Thanks,
Yang
> [...]
>
> Thanks,
> Ryan
>
Hi Yang,
kernel test robot noticed the following build warnings:
[auto build test WARNING on arm64/for-next/core]
[also build test WARNING on linus/master v6.14-rc5 next-20250307]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Yang-Shi/arm64-Add-BBM-Level-2-cpu-feature/20250305-062252
base: https://git.kernel.org/pub/scm/linux/kernel/git/arm64/linux.git for-next/core
patch link: https://lore.kernel.org/r/20250304222018.615808-5-yang%40os.amperecomputing.com
patch subject: [v3 PATCH 4/6] arm64: mm: support large block mapping when rodata=full
config: arm64-randconfig-002-20250308 (https://download.01.org/0day-ci/archive/20250308/202503080930.7ZetfmFz-lkp@intel.com/config)
compiler: aarch64-linux-gcc (GCC) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250308/202503080930.7ZetfmFz-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202503080930.7ZetfmFz-lkp@intel.com/
All warnings (new ones prefixed by >>):
arch/arm64/mm/mmu.c: In function 'alloc_init_pud':
>> arch/arm64/mm/mmu.c:511:35: warning: suggest braces around empty body in an 'if' statement [-Wempty-body]
511 | pud_clear_fixmap();
| ^
arch/arm64/mm/mmu.c: In function 'alloc_init_p4d':
arch/arm64/mm/mmu.c:570:35: warning: suggest braces around empty body in an 'if' statement [-Wempty-body]
570 | p4d_clear_fixmap();
| ^
vim +/if +511 arch/arm64/mm/mmu.c
d27cfa1fc823d3 Ard Biesheuvel 2017-03-09 428
2451145c9a60e0 Yang Shi 2025-03-04 429 static int alloc_init_pud(p4d_t *p4dp, unsigned long addr, unsigned long end,
da141706aea52c Laura Abbott 2015-01-21 430 phys_addr_t phys, pgprot_t prot,
90292aca9854a2 Yu Zhao 2019-03-11 431 phys_addr_t (*pgtable_alloc)(int),
c0951366d4b7e0 Ard Biesheuvel 2017-03-09 432 int flags)
c1cc1552616d0f Catalin Marinas 2012-03-05 433 {
c1cc1552616d0f Catalin Marinas 2012-03-05 434 unsigned long next;
2451145c9a60e0 Yang Shi 2025-03-04 435 int ret = 0;
e9f6376858b979 Mike Rapoport 2020-06-04 436 p4d_t p4d = READ_ONCE(*p4dp);
6ed8a3a094b43a Ard Biesheuvel 2024-02-14 437 pud_t *pudp;
6fad683b9a5c21 Yang Shi 2025-03-04 438 bool split = flags & SPLIT_MAPPINGS;
6fad683b9a5c21 Yang Shi 2025-03-04 439
6fad683b9a5c21 Yang Shi 2025-03-04 440 if (split) {
6fad683b9a5c21 Yang Shi 2025-03-04 441 BUG_ON(p4d_none(p4d));
6fad683b9a5c21 Yang Shi 2025-03-04 442 pudp = pud_offset(p4dp, addr);
6fad683b9a5c21 Yang Shi 2025-03-04 443 goto split_pgtable;
6fad683b9a5c21 Yang Shi 2025-03-04 444 }
c1cc1552616d0f Catalin Marinas 2012-03-05 445
e9f6376858b979 Mike Rapoport 2020-06-04 446 if (p4d_none(p4d)) {
efe72541355d4d Yicong Yang 2024-11-02 447 p4dval_t p4dval = P4D_TYPE_TABLE | P4D_TABLE_UXN | P4D_TABLE_AF;
132233a759580f Laura Abbott 2016-02-05 448 phys_addr_t pud_phys;
87143f404f338d Ard Biesheuvel 2021-03-10 449
87143f404f338d Ard Biesheuvel 2021-03-10 450 if (flags & NO_EXEC_MAPPINGS)
87143f404f338d Ard Biesheuvel 2021-03-10 451 p4dval |= P4D_TABLE_PXN;
132233a759580f Laura Abbott 2016-02-05 452 BUG_ON(!pgtable_alloc);
90292aca9854a2 Yu Zhao 2019-03-11 453 pud_phys = pgtable_alloc(PUD_SHIFT);
2451145c9a60e0 Yang Shi 2025-03-04 454 if (!pud_phys)
2451145c9a60e0 Yang Shi 2025-03-04 455 return -ENOMEM;
0e9df1c905d829 Ryan Roberts 2024-04-12 456 pudp = pud_set_fixmap(pud_phys);
0e9df1c905d829 Ryan Roberts 2024-04-12 457 init_clear_pgtable(pudp);
0e9df1c905d829 Ryan Roberts 2024-04-12 458 pudp += pud_index(addr);
87143f404f338d Ard Biesheuvel 2021-03-10 459 __p4d_populate(p4dp, pud_phys, p4dval);
0e9df1c905d829 Ryan Roberts 2024-04-12 460 } else {
e9f6376858b979 Mike Rapoport 2020-06-04 461 BUG_ON(p4d_bad(p4d));
e9f6376858b979 Mike Rapoport 2020-06-04 462 pudp = pud_set_fixmap_offset(p4dp, addr);
0e9df1c905d829 Ryan Roberts 2024-04-12 463 }
0e9df1c905d829 Ryan Roberts 2024-04-12 464
6fad683b9a5c21 Yang Shi 2025-03-04 465 split_pgtable:
c1cc1552616d0f Catalin Marinas 2012-03-05 466 do {
20a004e7b017cc Will Deacon 2018-02-15 467 pud_t old_pud = READ_ONCE(*pudp);
e98216b52176ba Ard Biesheuvel 2016-10-21 468
c1cc1552616d0f Catalin Marinas 2012-03-05 469 next = pud_addr_end(addr, end);
206a2a73a62d37 Steve Capper 2014-05-06 470
6fad683b9a5c21 Yang Shi 2025-03-04 471 if (split) {
6fad683b9a5c21 Yang Shi 2025-03-04 472 ret = split_pud(pudp, old_pud, pgtable_alloc);
6fad683b9a5c21 Yang Shi 2025-03-04 473 if (ret)
6fad683b9a5c21 Yang Shi 2025-03-04 474 break;
6fad683b9a5c21 Yang Shi 2025-03-04 475
6fad683b9a5c21 Yang Shi 2025-03-04 476 ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
6fad683b9a5c21 Yang Shi 2025-03-04 477 pgtable_alloc, flags);
6fad683b9a5c21 Yang Shi 2025-03-04 478 if (ret)
6fad683b9a5c21 Yang Shi 2025-03-04 479 break;
6fad683b9a5c21 Yang Shi 2025-03-04 480
6fad683b9a5c21 Yang Shi 2025-03-04 481 continue;
6fad683b9a5c21 Yang Shi 2025-03-04 482 }
6fad683b9a5c21 Yang Shi 2025-03-04 483
206a2a73a62d37 Steve Capper 2014-05-06 484 /*
206a2a73a62d37 Steve Capper 2014-05-06 485 * For 4K granule only, attempt to put down a 1GB block
206a2a73a62d37 Steve Capper 2014-05-06 486 */
1310222c276b79 Anshuman Khandual 2022-02-16 487 if (pud_sect_supported() &&
1310222c276b79 Anshuman Khandual 2022-02-16 488 ((addr | next | phys) & ~PUD_MASK) == 0 &&
c0951366d4b7e0 Ard Biesheuvel 2017-03-09 489 (flags & NO_BLOCK_MAPPINGS) == 0) {
20a004e7b017cc Will Deacon 2018-02-15 490 pud_set_huge(pudp, phys, prot);
206a2a73a62d37 Steve Capper 2014-05-06 491
206a2a73a62d37 Steve Capper 2014-05-06 492 /*
e98216b52176ba Ard Biesheuvel 2016-10-21 493 * After the PUD entry has been populated once, we
e98216b52176ba Ard Biesheuvel 2016-10-21 494 * only allow updates to the permission attributes.
206a2a73a62d37 Steve Capper 2014-05-06 495 */
e98216b52176ba Ard Biesheuvel 2016-10-21 496 BUG_ON(!pgattr_change_is_safe(pud_val(old_pud),
20a004e7b017cc Will Deacon 2018-02-15 497 READ_ONCE(pud_val(*pudp))));
206a2a73a62d37 Steve Capper 2014-05-06 498 } else {
2451145c9a60e0 Yang Shi 2025-03-04 499 ret = alloc_init_cont_pmd(pudp, addr, next, phys, prot,
c0951366d4b7e0 Ard Biesheuvel 2017-03-09 500 pgtable_alloc, flags);
2451145c9a60e0 Yang Shi 2025-03-04 501 if (ret)
2451145c9a60e0 Yang Shi 2025-03-04 502 break;
e98216b52176ba Ard Biesheuvel 2016-10-21 503
e98216b52176ba Ard Biesheuvel 2016-10-21 504 BUG_ON(pud_val(old_pud) != 0 &&
20a004e7b017cc Will Deacon 2018-02-15 505 pud_val(old_pud) != READ_ONCE(pud_val(*pudp)));
206a2a73a62d37 Steve Capper 2014-05-06 506 }
c1cc1552616d0f Catalin Marinas 2012-03-05 507 phys += next - addr;
20a004e7b017cc Will Deacon 2018-02-15 508 } while (pudp++, addr = next, addr != end);
f4710445458c0a Mark Rutland 2016-01-25 509
6fad683b9a5c21 Yang Shi 2025-03-04 510 if (!split)
f4710445458c0a Mark Rutland 2016-01-25 @511 pud_clear_fixmap();
2451145c9a60e0 Yang Shi 2025-03-04 512
2451145c9a60e0 Yang Shi 2025-03-04 513 return ret;
c1cc1552616d0f Catalin Marinas 2012-03-05 514 }
c1cc1552616d0f Catalin Marinas 2012-03-05 515
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki