linear_map_split_to_ptes() currently allocates page tables while
splitting the linear mapping into PTEs under stop_machine() using GFP_ATOMIC.
This is fine for non-PREEMPT_RT configurations.
However, it becomes problematic on PREEMPT_RT, because
generic memory allocation/free APIs (e.g. pgtable_alloc(), __get_free_pages(), etc.)
cannot be called from a non-preemptible context, except for the _nolock() variants.
This is because the generic memory allocation/free paths can sleep:
they rely on spin_lock(), which becomes a sleeping lock on PREEMPT_RT.
In other words, even calling pgtable_alloc() with GFP_ATOMIC is not
permitted in linear_map_split_to_ptes() when it is executed by the
stopper thread with preemption disabled on PREEMPT_RT.
To address this, first collect the required number of page tables and
preallocate them, then use the preallocated page tables when splitting
the linear mapping in linear_map_split_to_ptes().
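
In outline, the resulting flow in linear_map_maybe_split_to_ptes() is
(see the diff below):

	linear_map_prealloc_split_pgtables();	/* count blocks, preallocate tables */
	init_idmap_kpti_bbml2_flag();
	stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
	linear_map_free_split_pgtables();	/* free any unused tables */

where linear_map_split_to_ptes() consumes the preallocated tables via
pgd_pgtable_get_preallocated().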
Fixes: 3df6979d222b ("arm64: mm: split linear mapping if BBML2 unsupported on secondary CPUs")
Signed-off-by: Yeoreum Yun <yeoreum.yun@arm.com>
Reviewed-by: Ryan Roberts <ryan.roberts@arm.com>
---
arch/arm64/mm/mmu.c | 202 +++++++++++++++++++++++++++++++++++---------
1 file changed, 164 insertions(+), 38 deletions(-)
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 4b4908ae189b..120874a2d35b 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -529,18 +529,14 @@ static void early_create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys,
panic("Failed to create page tables\n");
}
-static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
- enum pgtable_type pgtable_type)
-{
- /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
- struct ptdesc *ptdesc = pagetable_alloc(gfp & ~__GFP_ZERO, 0);
- phys_addr_t pa;
-
- if (!ptdesc)
- return INVALID_PHYS_ADDR;
-
- pa = page_to_phys(ptdesc_page(ptdesc));
+static struct ptdesc **split_pgtables;
+static unsigned long split_pgtables_count;
+static unsigned long split_pgtables_idx;
+static void __pgd_pgtable_init(struct mm_struct *mm,
+ struct ptdesc *ptdesc,
+ enum pgtable_type pgtable_type)
+{
switch (pgtable_type) {
case TABLE_PTE:
BUG_ON(!pagetable_pte_ctor(mm, ptdesc));
@@ -555,26 +551,49 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm, gfp_t gfp,
pagetable_p4d_ctor(ptdesc);
break;
}
-
- return pa;
}
-static phys_addr_t
-pgd_pgtable_alloc_init_mm_gfp(enum pgtable_type pgtable_type, gfp_t gfp)
+static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
+ enum pgtable_type pgtable_type)
{
- return __pgd_pgtable_alloc(&init_mm, gfp, pgtable_type);
+ /* Page is zeroed by init_clear_pgtable() so don't duplicate effort. */
+ struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
+
+ if (!ptdesc)
+ return INVALID_PHYS_ADDR;
+
+ __pgd_pgtable_init(mm, ptdesc, pgtable_type);
+
+ return page_to_phys(ptdesc_page(ptdesc));
}
-static phys_addr_t __maybe_unused
+static phys_addr_t
pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
{
- return pgd_pgtable_alloc_init_mm_gfp(pgtable_type, GFP_PGTABLE_KERNEL);
+ return __pgd_pgtable_alloc(&init_mm, pgtable_type);
}
static phys_addr_t
pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
{
- return __pgd_pgtable_alloc(NULL, GFP_PGTABLE_KERNEL, pgtable_type);
+ return __pgd_pgtable_alloc(NULL, pgtable_type);
+}
+
+static phys_addr_t
+pgd_pgtable_get_preallocated(enum pgtable_type pgtable_type)
+{
+ struct ptdesc *ptdesc;
+
+ if (WARN_ON(split_pgtables_idx >= split_pgtables_count))
+ return INVALID_PHYS_ADDR;
+
+ ptdesc = split_pgtables[split_pgtables_idx++];
+ if (!ptdesc)
+ return INVALID_PHYS_ADDR;
+
+ __pgd_pgtable_init(&init_mm, ptdesc, pgtable_type);
+
+ return page_to_phys(ptdesc_page(ptdesc));
}
static void split_contpte(pte_t *ptep)
@@ -586,7 +605,8 @@ static void split_contpte(pte_t *ptep)
__set_pte(ptep, pte_mknoncont(__ptep_get(ptep)));
}
-static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
+static int split_pmd(pmd_t *pmdp, pmd_t pmd, pgtable_alloc_t pgtable_alloc,
+ bool to_cont)
{
pmdval_t tableprot = PMD_TYPE_TABLE | PMD_TABLE_UXN | PMD_TABLE_AF;
unsigned long pfn = pmd_pfn(pmd);
@@ -595,7 +615,7 @@ static int split_pmd(pmd_t *pmdp, pmd_t pmd, gfp_t gfp, bool to_cont)
pte_t *ptep;
int i;
- pte_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PTE, gfp);
+ pte_phys = pgtable_alloc(TABLE_PTE);
if (pte_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
ptep = (pte_t *)phys_to_virt(pte_phys);
@@ -630,7 +650,8 @@ static void split_contpmd(pmd_t *pmdp)
set_pmd(pmdp, pmd_mknoncont(pmdp_get(pmdp)));
}
-static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
+static int split_pud(pud_t *pudp, pud_t pud, pgtable_alloc_t pgtable_alloc,
+ bool to_cont)
{
pudval_t tableprot = PUD_TYPE_TABLE | PUD_TABLE_UXN | PUD_TABLE_AF;
unsigned int step = PMD_SIZE >> PAGE_SHIFT;
@@ -640,7 +661,7 @@ static int split_pud(pud_t *pudp, pud_t pud, gfp_t gfp, bool to_cont)
pmd_t *pmdp;
int i;
- pmd_phys = pgd_pgtable_alloc_init_mm_gfp(TABLE_PMD, gfp);
+ pmd_phys = pgtable_alloc(TABLE_PMD);
if (pmd_phys == INVALID_PHYS_ADDR)
return -ENOMEM;
pmdp = (pmd_t *)phys_to_virt(pmd_phys);
@@ -709,7 +730,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
if (!pud_present(pud))
goto out;
if (pud_leaf(pud)) {
- ret = split_pud(pudp, pud, GFP_PGTABLE_KERNEL, true);
+ ret = split_pud(pudp, pud, pgd_pgtable_alloc_init_mm, true);
if (ret)
goto out;
}
@@ -734,7 +755,7 @@ static int split_kernel_leaf_mapping_locked(unsigned long addr)
*/
if (ALIGN_DOWN(addr, PMD_SIZE) == addr)
goto out;
- ret = split_pmd(pmdp, pmd, GFP_PGTABLE_KERNEL, true);
+ ret = split_pmd(pmdp, pmd, pgd_pgtable_alloc_init_mm, true);
if (ret)
goto out;
}
@@ -832,12 +853,12 @@ int split_kernel_leaf_mapping(unsigned long start, unsigned long end)
static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
- gfp_t gfp = *(gfp_t *)walk->private;
+ pgtable_alloc_t *pgtable_alloc = walk->private;
pud_t pud = pudp_get(pudp);
int ret = 0;
if (pud_leaf(pud))
- ret = split_pud(pudp, pud, gfp, false);
+ ret = split_pud(pudp, pud, pgtable_alloc, false);
return ret;
}
@@ -845,14 +866,14 @@ static int split_to_ptes_pud_entry(pud_t *pudp, unsigned long addr,
static int split_to_ptes_pmd_entry(pmd_t *pmdp, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
- gfp_t gfp = *(gfp_t *)walk->private;
+ pgtable_alloc_t *pgtable_alloc = walk->private;
pmd_t pmd = pmdp_get(pmdp);
int ret = 0;
if (pmd_leaf(pmd)) {
if (pmd_cont(pmd))
split_contpmd(pmdp);
- ret = split_pmd(pmdp, pmd, gfp, false);
+ ret = split_pmd(pmdp, pmd, pgtable_alloc, false);
/*
* We have split the pmd directly to ptes so there is no need to
@@ -881,13 +902,15 @@ static const struct mm_walk_ops split_to_ptes_ops = {
.pte_entry = split_to_ptes_pte_entry,
};
-static int range_split_to_ptes(unsigned long start, unsigned long end, gfp_t gfp)
+static int range_split_to_ptes(unsigned long start, unsigned long end,
+ pgtable_alloc_t pgtable_alloc)
{
int ret;
arch_enter_lazy_mmu_mode();
ret = walk_kernel_page_table_range_lockless(start, end,
- &split_to_ptes_ops, NULL, &gfp);
+ &split_to_ptes_ops, NULL,
+ pgtable_alloc);
arch_leave_lazy_mmu_mode();
return ret;
@@ -904,6 +927,103 @@ static void __init init_idmap_kpti_bbml2_flag(void)
smp_mb();
}
+static int __init
+collect_to_split_pud_entry(pud_t *pudp, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pud_t pud = pudp_get(pudp);
+
+ if (pud_leaf(pud)) {
+ split_pgtables_count += 1 + PTRS_PER_PMD;
+ walk->action = ACTION_CONTINUE;
+ }
+
+ return 0;
+}
+
+static int __init
+collect_to_split_pmd_entry(pmd_t *pmdp, unsigned long addr,
+ unsigned long next, struct mm_walk *walk)
+{
+ pmd_t pmd = pmdp_get(pmdp);
+
+ if (pmd_leaf(pmd))
+ split_pgtables_count++;
+
+ walk->action = ACTION_CONTINUE;
+
+ return 0;
+}
+
+static void __init linear_map_free_split_pgtables(void)
+{
+ int i;
+
+ if (!split_pgtables_count || !split_pgtables)
+ goto skip_free;
+
+ for (i = split_pgtables_idx; i < split_pgtables_count; i++) {
+ if (split_pgtables[i])
+ pagetable_free(split_pgtables[i]);
+ }
+
+ kvfree(split_pgtables);
+
+skip_free:
+ split_pgtables = NULL;
+ split_pgtables_count = 0;
+ split_pgtables_idx = 0;
+}
+
+static int __init linear_map_prealloc_split_pgtables(void)
+{
+ int ret, i;
+ unsigned long lstart = _PAGE_OFFSET(vabits_actual);
+ unsigned long lend = PAGE_END;
+ unsigned long kstart = (unsigned long)lm_alias(_stext);
+ unsigned long kend = (unsigned long)lm_alias(__init_begin);
+
+ const struct mm_walk_ops collect_to_split_ops = {
+ .pud_entry = collect_to_split_pud_entry,
+ .pmd_entry = collect_to_split_pmd_entry
+ };
+
+ split_pgtables_idx = 0;
+ split_pgtables_count = 0;
+
+ ret = walk_kernel_page_table_range_lockless(lstart, kstart,
+ &collect_to_split_ops,
+ NULL, NULL);
+ if (!ret)
+ ret = walk_kernel_page_table_range_lockless(kend, lend,
+ &collect_to_split_ops,
+ NULL, NULL);
+ if (ret || !split_pgtables_count)
+ goto error;
+
+ ret = -ENOMEM;
+
+ split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!split_pgtables)
+ goto error;
+
+ for (i = 0; i < split_pgtables_count; i++) {
+ /* The page table will be filled during splitting, so zeroing it is unnecessary. */
+ split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
+ if (!split_pgtables[i])
+ goto error;
+ }
+
+ ret = 0;
+
+error:
+ if (ret)
+ linear_map_free_split_pgtables();
+
+ return ret;
+}
+
static int __init linear_map_split_to_ptes(void *__unused)
{
/*
@@ -929,9 +1049,9 @@ static int __init linear_map_split_to_ptes(void *__unused)
* PTE. The kernel alias remains static throughout runtime so
* can continue to be safely mapped with large mappings.
*/
- ret = range_split_to_ptes(lstart, kstart, GFP_ATOMIC);
+ ret = range_split_to_ptes(lstart, kstart, pgd_pgtable_get_preallocated);
if (!ret)
- ret = range_split_to_ptes(kend, lend, GFP_ATOMIC);
+ ret = range_split_to_ptes(kend, lend, pgd_pgtable_get_preallocated);
if (ret)
panic("Failed to split linear map\n");
flush_tlb_kernel_range(lstart, lend);
@@ -964,10 +1084,16 @@ static int __init linear_map_split_to_ptes(void *__unused)
void __init linear_map_maybe_split_to_ptes(void)
{
- if (linear_map_requires_bbml2 && !system_supports_bbml2_noabort()) {
- init_idmap_kpti_bbml2_flag();
- stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
- }
+ if (!linear_map_requires_bbml2 || system_supports_bbml2_noabort())
+ return;
+
+ if (linear_map_prealloc_split_pgtables())
+ panic("Failed to split linear map\n");
+
+ init_idmap_kpti_bbml2_flag();
+ stop_machine(linear_map_split_to_ptes, NULL, cpu_online_mask);
+
+ linear_map_free_split_pgtables();
}
/*
@@ -1098,7 +1224,7 @@ bool arch_kfence_init_pool(void)
return true;
mutex_lock(&pgtable_split_lock);
- ret = range_split_to_ptes(start, end, GFP_PGTABLE_KERNEL);
+ ret = range_split_to_ptes(start, end, pgd_pgtable_alloc_init_mm);
mutex_unlock(&pgtable_split_lock);
/*
--
On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
> +static int __init linear_map_prealloc_split_pgtables(void)
> +{
> + int ret, i;
> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> + unsigned long lend = PAGE_END;
> + unsigned long kstart = (unsigned long)lm_alias(_stext);
> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
> +
> + const struct mm_walk_ops collect_to_split_ops = {
> + .pud_entry = collect_to_split_pud_entry,
> + .pmd_entry = collect_to_split_pmd_entry
> + };
Why do we need to rewalk the page-table here instead of collating the
number of block mappings we put down when creating the linear map in
the first place?
> + split_pgtables_idx = 0;
> + split_pgtables_count = 0;
> +
> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> + &collect_to_split_ops,
> + NULL, NULL);
> + if (!ret)
> + ret = walk_kernel_page_table_range_lockless(kend, lend,
> + &collect_to_split_ops,
> + NULL, NULL);
> + if (ret || !split_pgtables_count)
> + goto error;
> +
> + ret = -ENOMEM;
> +
> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
> + GFP_KERNEL | __GFP_ZERO);
> + if (!split_pgtables)
> + goto error;
> +
> + for (i = 0; i < split_pgtables_count; i++) {
> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
> + if (!split_pgtables[i])
> + goto error;
This looks potentially expensive on the boot path and only gets worse as
the amount of memory grows. Maybe we should predicate this preallocation
on preempt-rt?
Will
Hi Will,
> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
> > +static int __init linear_map_prealloc_split_pgtables(void)
> > +{
> > + int ret, i;
> > + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> > + unsigned long lend = PAGE_END;
> > + unsigned long kstart = (unsigned long)lm_alias(_stext);
> > + unsigned long kend = (unsigned long)lm_alias(__init_begin);
> > +
> > + const struct mm_walk_ops collect_to_split_ops = {
> > + .pud_entry = collect_to_split_pud_entry,
> > + .pmd_entry = collect_to_split_pmd_entry
> > + };
>
> Why do we need to rewalk the page-table here instead of collating the
> number of block mappings we put down when creating the linear map in
> the first place?
First, the linear alias of [_text, __init_begin) is not a target for
the split, and it also seems strange to me to add code inside alloc_init_XXX()
that both checks an address range and counts to get the number of block mappings.
Second, for a future feature,
I hope to add some code to split a "specific" area, e.g.
to set a specific pkey for a specific area.
In this case, it's useful to rewalk the page-table with the specific
range to get the number of block mappings.
>
> > + split_pgtables_idx = 0;
> > + split_pgtables_count = 0;
> > +
> > + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> > + &collect_to_split_ops,
> > + NULL, NULL);
> > + if (!ret)
> > + ret = walk_kernel_page_table_range_lockless(kend, lend,
> > + &collect_to_split_ops,
> > + NULL, NULL);
> > + if (ret || !split_pgtables_count)
> > + goto error;
> > +
> > + ret = -ENOMEM;
> > +
> > + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
> > + GFP_KERNEL | __GFP_ZERO);
> > + if (!split_pgtables)
> > + goto error;
> > +
> > + for (i = 0; i < split_pgtables_count; i++) {
> > + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
> > + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
> > + if (!split_pgtables[i])
> > + goto error;
>
> This looks potentially expensive on the boot path and only gets worse as
> the amount of memory grows. Maybe we should predicate this preallocation
> on preempt-rt?
Agreed. Then I'll apply the pre-allocation on PREEMPT_RT only.
Thanks for your review.
--
Sincerely,
Yeoreum Yun
On 19/01/2026 21:24, Yeoreum Yun wrote:
> Hi Will,
>
>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
>>> +static int __init linear_map_prealloc_split_pgtables(void)
>>> +{
>>> + int ret, i;
>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
>>> + unsigned long lend = PAGE_END;
>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
>>> +
>>> + const struct mm_walk_ops collect_to_split_ops = {
>>> + .pud_entry = collect_to_split_pud_entry,
>>> + .pmd_entry = collect_to_split_pmd_entry
>>> + };
>>
>> Why do we need to rewalk the page-table here instead of collating the
>> number of block mappings we put down when creating the linear map in
>> the first place?
That's a good point; perhaps we can reuse the counters that this series introduces?
https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
>
> First, linear alias of the [_text, __init_begin) is not a target for
> the split and it also seems strange to me to add code inside alloc_init_XXX()
> that both checks an address range and counts to get the number of block mappings.
>
> Second, for a future feature,
> I hope to add some code to split "specfic" area to be spilt e.x)
> to set a specific pkey for specific area.
Could you give more detail on this? My working assumption is that either the
system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
linear map. If it does, we already have logic to split parts of the linear map
when needed.
>
> In this case, it's useful to rewalk the page-table with the specific
> range to get the number of block mapping.
>
>>
>>> + split_pgtables_idx = 0;
>>> + split_pgtables_count = 0;
>>> +
>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
>>> + &collect_to_split_ops,
>>> + NULL, NULL);
>>> + if (!ret)
>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
>>> + &collect_to_split_ops,
>>> + NULL, NULL);
>>> + if (ret || !split_pgtables_count)
>>> + goto error;
>>> +
>>> + ret = -ENOMEM;
>>> +
>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
>>> + GFP_KERNEL | __GFP_ZERO);
>>> + if (!split_pgtables)
>>> + goto error;
>>> +
>>> + for (i = 0; i < split_pgtables_count; i++) {
>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
>>> + if (!split_pgtables[i])
>>> + goto error;
>>
>> This looks potentially expensive on the boot path and only gets worse as
>> the amount of memory grows. Maybe we should predicate this preallocation
>> on preempt-rt?
>
> Agree. then I'll apply pre-allocation with PREEMPT_RT only.
I guess I'm missing something obvious but I don't understand the problem here...
We are only deferring the allocation of all these pgtables, so the cost is
neutral surely? Had we correctly guessed that the system doesn't support BBML2
earlier, we would have had to allocate all these pgtables earlier.
Another way to look at it is that we are still allocating the same number of
pgtables in the existing fallback path, it's just that we are doing it inside
the stop_machine().
My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
with significantly less testing...
Thanks,
Ryan
>
> Thanks for your review.
>
> --
> Sincerely,
> Yeoreum Yun
Hi Ryan
> On 19/01/2026 21:24, Yeoreum Yun wrote:
> > Hi Will,
> >
> >> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
> >>> +static int __init linear_map_prealloc_split_pgtables(void)
> >>> +{
> >>> + int ret, i;
> >>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> >>> + unsigned long lend = PAGE_END;
> >>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
> >>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
> >>> +
> >>> + const struct mm_walk_ops collect_to_split_ops = {
> >>> + .pud_entry = collect_to_split_pud_entry,
> >>> + .pmd_entry = collect_to_split_pmd_entry
> >>> + };
> >>
> >> Why do we need to rewalk the page-table here instead of collating the
> >> number of block mappings we put down when creating the linear map in
> >> the first place?
>
> That's a good point; perhaps we can reuse the counters that this series introduces?
>
> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
>
> >
> > First, linear alias of the [_text, __init_begin) is not a target for
> > the split and it also seems strange to me to add code inside alloc_init_XXX()
> > that both checks an address range and counts to get the number of block mappings.
> >
> > Second, for a future feature,
> > I hope to add some code to split "specfic" area to be spilt e.x)
> > to set a specific pkey for specific area.
>
> Could you give more detail on this? My working assumption is that either the
> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
> linear map. If it does, we already have logic to split parts of the linear map
> when needed.
This is not for the linear mapping case, but for the "kernel text area".
As a draft, I want to mark some kernel code as executable by
both the kernel and eBPF programs
(I'm trying to make kernel code not directly executable by eBPF programs
with the POE feature).
For this "executable area" shared by the kernel and eBPF programs
-- a typical example is the exception entry -- we need to split that
specific range and mark it with a special POE index.
>
> >
> > In this case, it's useful to rewalk the page-table with the specific
> > range to get the number of block mapping.
> >
> >>
> >>> + split_pgtables_idx = 0;
> >>> + split_pgtables_count = 0;
> >>> +
> >>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> >>> + &collect_to_split_ops,
> >>> + NULL, NULL);
> >>> + if (!ret)
> >>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
> >>> + &collect_to_split_ops,
> >>> + NULL, NULL);
> >>> + if (ret || !split_pgtables_count)
> >>> + goto error;
> >>> +
> >>> + ret = -ENOMEM;
> >>> +
> >>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
> >>> + GFP_KERNEL | __GFP_ZERO);
> >>> + if (!split_pgtables)
> >>> + goto error;
> >>> +
> >>> + for (i = 0; i < split_pgtables_count; i++) {
> >>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
> >>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
> >>> + if (!split_pgtables[i])
> >>> + goto error;
> >>
> >> This looks potentially expensive on the boot path and only gets worse as
> >> the amount of memory grows. Maybe we should predicate this preallocation
> >> on preempt-rt?
> >
> > Agree. then I'll apply pre-allocation with PREEMPT_RT only.
>
> I guess I'm missing something obvious but I don't understand the problem here...
> We are only deferring the allocation of all these pgtables, so the cost is
> neutral surely? Had we correctly guessed that the system doesn't support BBML2
> earlier, we would have had to allocate all these pgtables earlier.
>
> Another way to look at it is that we are still allocating the same number of
> pgtables in the existing fallback path, it's just that we are doing it inside
> the stop_machine().
>
> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
> with significantly less testing...
IIUC, Will's point is the additional memory allocation for
"split_pgtables", which holds the pre-allocated page tables.
As the amount of memory increases, this size would definitely increase the cost.
And this cost need not burden !PREEMPT_RT, since it can
allocate memory in stop_machine() with GFP_ATOMIC.
But I also agree that if the cost is not that large,
a single path is convincing; additionally, as I mentioned in another thread,
it would be good not to give the false impression that GFP_ATOMIC is fine
everywhere, even on PREEMPT_RT.
--
Sincerely,
Yeoreum Yun
On 1/20/26 1:29 AM, Yeoreum Yun wrote:
> Hi Ryan
>> On 19/01/2026 21:24, Yeoreum Yun wrote:
>>> Hi Will,
>>>
>>>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
>>>>> +static int __init linear_map_prealloc_split_pgtables(void)
>>>>> +{
>>>>> + int ret, i;
>>>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
>>>>> + unsigned long lend = PAGE_END;
>>>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
>>>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
>>>>> +
>>>>> + const struct mm_walk_ops collect_to_split_ops = {
>>>>> + .pud_entry = collect_to_split_pud_entry,
>>>>> + .pmd_entry = collect_to_split_pmd_entry
>>>>> + };
>>>> Why do we need to rewalk the page-table here instead of collating the
>>>> number of block mappings we put down when creating the linear map in
>>>> the first place?
>> That's a good point; perhaps we can reuse the counters that this series introduces?
>>
>> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
Yeah, good point. It seems feasible to me. The patch can count how many
PUD/CONT_PMD/PMD mappings there are, so we can calculate how many page table
pages need to be allocated based on those counters.
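As a rough sketch, assuming hypothetical creation-time counters
nr_pud_blocks and nr_pmd_blocks (individual block entries, with the kernel
alias already excluded), the calculation would mirror the patch's
collect_to_split_{pud,pmd}_entry() callbacks:

	/*
	 * One PMD table plus PTRS_PER_PMD PTE tables per PUD block,
	 * and one PTE table per PMD block.
	 */
	static unsigned long __init split_pgtables_needed(unsigned long nr_pud_blocks,
							  unsigned long nr_pmd_blocks)
	{
		return nr_pud_blocks * (1 + PTRS_PER_PMD) + nr_pmd_blocks;
	}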
>>
>>> First, linear alias of the [_text, __init_begin) is not a target for
>>> the split and it also seems strange to me to add code inside alloc_init_XXX()
>>> that both checks an address range and counts to get the number of block mappings.
IIUC, it should not be that hard to exclude kernel mappings. We know
kernel_start and kernel_end, so you should be able to maintain a separate set
of counters for the kernel, then subtract them when you calculate how
many page table pages need to be allocated.
>>>
>>> Second, for a future feature,
>>> I hope to add some code to split "specfic" area to be spilt e.x)
>>> to set a specific pkey for specific area.
>> Could you give more detail on this? My working assumption is that either the
>> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
>> linear map. If it does, we already have logic to split parts of the linear map
>> when needed.
> This is not for a linear mapping case. but for a "kernel text area".
> As a draft, I want to mark some of kernel code can executable
> both kernel and eBPF program.
> (I'm trying to make eBPF program non-executable kernel code directly
> with POE feature).
> For this "executable area" both of kernel and eBPF program
> -- typical example is exception entry, It need to split that specific
> range and mark them with special POE index.
IIUC, you want to change POE attributes for some kernel area (mainly in
the vmalloc address space). It sounds like you could do something like
set_memory_rox(), but split the vmalloc address mapping instead of the
linear mapping. Or do you need to preallocate page table pages in this case?
Anyway, we can have a simpler way to count block mappings for splitting the
linear mapping, so it seems unnecessary to re-walk the page table again IMHO.
Thanks,
Yang
>
>>> In this case, it's useful to rewalk the page-table with the specific
>>> range to get the number of block mapping.
>>>
>>>>> + split_pgtables_idx = 0;
>>>>> + split_pgtables_count = 0;
>>>>> +
>>>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
>>>>> + &collect_to_split_ops,
>>>>> + NULL, NULL);
>>>>> + if (!ret)
>>>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
>>>>> + &collect_to_split_ops,
>>>>> + NULL, NULL);
>>>>> + if (ret || !split_pgtables_count)
>>>>> + goto error;
>>>>> +
>>>>> + ret = -ENOMEM;
>>>>> +
>>>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
>>>>> + GFP_KERNEL | __GFP_ZERO);
>>>>> + if (!split_pgtables)
>>>>> + goto error;
>>>>> +
>>>>> + for (i = 0; i < split_pgtables_count; i++) {
>>>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
>>>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
>>>>> + if (!split_pgtables[i])
>>>>> + goto error;
>>>> This looks potentially expensive on the boot path and only gets worse as
>>>> the amount of memory grows. Maybe we should predicate this preallocation
>>>> on preempt-rt?
>>> Agree. then I'll apply pre-allocation with PREEMPT_RT only.
>> I guess I'm missing something obvious but I don't understand the problem here...
>> We are only deferring the allocation of all these pgtables, so the cost is
>> neutral surely? Had we correctly guessed that the system doesn't support BBML2
>> earlier, we would have had to allocate all these pgtables earlier.
>>
>> Another way to look at it is that we are still allocating the same number of
>> pgtables in the existing fallback path, it's just that we are doing it inside
>> the stop_machine().
>>
>> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
>> with significantly less testing...
> IIUC, Will's mention is additional memory allocation for
> "split_pgtables" where saved "pre-allocate" page tables.
> As the memory increase, definitely this size would increase the cost.
>
> And this cost need not to burden for !PREEMPT_RT since
> it can use memory allocation in stop_machine() with GFP_ATOMIC.
>
> But I also agree in the aspect that if that cost not much of huge,
> It's also convincing and additionally, as I mentioned in another thread,
> It would be good not to give a hallucination GFP_ATOMIC is fine for
> everywhere even in the PREEMPT_RT.
>
> --
> Sincerely,
> Yeoreum Yun
Hi Yang,
>
>
> On 1/20/26 1:29 AM, Yeoreum Yun wrote:
> > Hi Ryan
> > > On 19/01/2026 21:24, Yeoreum Yun wrote:
> > > > Hi Will,
> > > >
> > > > > On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
> > > > > > +static int __init linear_map_prealloc_split_pgtables(void)
> > > > > > +{
> > > > > > + int ret, i;
> > > > > > + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> > > > > > + unsigned long lend = PAGE_END;
> > > > > > + unsigned long kstart = (unsigned long)lm_alias(_stext);
> > > > > > + unsigned long kend = (unsigned long)lm_alias(__init_begin);
> > > > > > +
> > > > > > + const struct mm_walk_ops collect_to_split_ops = {
> > > > > > + .pud_entry = collect_to_split_pud_entry,
> > > > > > + .pmd_entry = collect_to_split_pmd_entry
> > > > > > + };
> > > > > Why do we need to rewalk the page-table here instead of collating the
> > > > > number of block mappings we put down when creating the linear map in
> > > > > the first place?
> > > That's a good point; perhaps we can reuse the counters that this series introduces?
> > >
> > > https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
>
> Yeah, good point. It seems feasible to me. The patch can count how many
> PUD/CONT_PMD/PMD mappings, we can calculate how many page table pages need
> to be allocated based on those counters.
>
> > >
> > > > First, linear alias of the [_text, __init_begin) is not a target for
> > > > the split and it also seems strange to me to add code inside alloc_init_XXX()
> > > > that both checks an address range and counts to get the number of block mappings.
>
> IIUC, it should be not that hard to exclude kernel mappings. We know
> kernel_start and kernel_end, so you should be able to maintain a set of
> counters for kernel, then minus them when you do the calculation for how
> many page table pages need to be allocated.
As you said, this is not difficult. However, what I meant was that
this collection would be done in alloc_init_XXX(), and in that case,
collecting the number of block mappings for the range
[kernel_start, kernel_end) and adding conditional logic in
alloc_init_XXX() seems a bit odd.
That said, for potential future use cases involving splitting specific ranges,
I don’t think having this kind of collection is necessarily a bad idea.
>
> > > >
> > > > Second, for a future feature,
> > > > I hope to add some code to split "specfic" area to be spilt e.x)
> > > > to set a specific pkey for specific area.
> > > Could you give more detail on this? My working assumption is that either the
> > > system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
> > > linear map. If it does, we already have logic to split parts of the linear map
> > > when needed.
> > This is not for a linear mapping case. but for a "kernel text area".
> > As a draft, I want to mark some of kernel code can executable
> > both kernel and eBPF program.
> > (I'm trying to make eBPF program non-executable kernel code directly
> > with POE feature).
> > For this "executable area" both of kernel and eBPF program
> > -- typical example is exception entry, It need to split that specific
> > range and mark them with special POE index.
>
> IIUC, you want to change POE attributes for some kernel area (mainly in
> vmalloc address space). It sounds like you can do something like
> set_memory_rox(), but just split vmalloc address mapping instead of linear
> mapping. Or you need preallocate page table pages in this case? Anyway we
> can have more simple way to count block mappings for splitting linear
> mapping, it seems not necessary to re-walk page table again IMHO.
As I said, it isn't only the vmalloc address mapping but also
the "kimage" mapping.
In this case, it needs to be split to set the specific code area
with a specific POE index.
The preallocated pages are for splitting via stop_machine(),
since page table allocation with GFP_ATOMIC isn't possible inside
stop_machine() on PREEMPT_RT.
Also, splitting the text area to set a specific POE index would be
done via stop_machine(), so the collection is required.
>
> >
> > > > In this case, it's useful to rewalk the page-table with the specific
> > > > range to get the number of block mapping.
> > > >
> > > > > > + split_pgtables_idx = 0;
> > > > > > + split_pgtables_count = 0;
> > > > > > +
> > > > > > + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> > > > > > + &collect_to_split_ops,
> > > > > > + NULL, NULL);
> > > > > > + if (!ret)
> > > > > > + ret = walk_kernel_page_table_range_lockless(kend, lend,
> > > > > > + &collect_to_split_ops,
> > > > > > + NULL, NULL);
> > > > > > + if (ret || !split_pgtables_count)
> > > > > > + goto error;
> > > > > > +
> > > > > > + ret = -ENOMEM;
> > > > > > +
> > > > > > + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
> > > > > > + GFP_KERNEL | __GFP_ZERO);
> > > > > > + if (!split_pgtables)
> > > > > > + goto error;
> > > > > > +
> > > > > > + for (i = 0; i < split_pgtables_count; i++) {
> > > > > > + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
> > > > > > + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
> > > > > > + if (!split_pgtables[i])
> > > > > > + goto error;
> > > > > This looks potentially expensive on the boot path and only gets worse as
> > > > > the amount of memory grows. Maybe we should predicate this preallocation
> > > > > on preempt-rt?
> > > > Agree. then I'll apply pre-allocation with PREEMPT_RT only.
> > > I guess I'm missing something obvious but I don't understand the problem here...
> > > We are only deferring the allocation of all these pgtables, so the cost is
> > > neutral surely? Had we correctly guessed that the system doesn't support BBML2
> > > earlier, we would have had to allocate all these pgtables earlier.
> > >
> > > Another way to look at it is that we are still allocating the same number of
> > > pgtables in the existing fallback path, it's just that we are doing it inside
> > > the stop_machine().
> > >
> > > My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
> > > with significantly less testing...
> > IIUC, Will's mention is additional memory allocation for
> > "split_pgtables" where saved "pre-allocate" page tables.
> > As the memory increase, definitely this size would increase the cost.
> >
> > And this cost need not to burden for !PREEMPT_RT since
> > it can use memory allocation in stop_machine() with GFP_ATOMIC.
> >
> > But I also agree in the aspect that if that cost not much of huge,
> > It's also convincing and additionally, as I mentioned in another thread,
> > It would be good not to give a hallucination GFP_ATOMIC is fine for
> > everywhere even in the PREEMPT_RT.
> >
> > --
> > Sincerely,
> > Yeoreum Yun
>
--
Sincerely,
Yeoreum Yun
On 1/20/26 3:01 PM, Yeoreum Yun wrote:
> Hi Yang,
>>
>> On 1/20/26 1:29 AM, Yeoreum Yun wrote:
>>> Hi Ryan
>>>> On 19/01/2026 21:24, Yeoreum Yun wrote:
>>>>> Hi Will,
>>>>>
>>>>>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
>>>>>>> +static int __init linear_map_prealloc_split_pgtables(void)
>>>>>>> +{
>>>>>>> + int ret, i;
>>>>>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
>>>>>>> + unsigned long lend = PAGE_END;
>>>>>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
>>>>>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
>>>>>>> +
>>>>>>> + const struct mm_walk_ops collect_to_split_ops = {
>>>>>>> + .pud_entry = collect_to_split_pud_entry,
>>>>>>> + .pmd_entry = collect_to_split_pmd_entry
>>>>>>> + };
>>>>>> Why do we need to rewalk the page-table here instead of collating the
>>>>>> number of block mappings we put down when creating the linear map in
>>>>>> the first place?
>>>> That's a good point; perhaps we can reuse the counters that this series introduces?
>>>>
>>>> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
>> Yeah, good point. It seems feasible to me. The patch can count how many
>> PUD/CONT_PMD/PMD mappings, we can calculate how many page table pages need
>> to be allocated based on those counters.
>>
>>>>> First, linear alias of the [_text, __init_begin) is not a target for
>>>>> the split and it also seems strange to me to add code inside alloc_init_XXX()
>>>>> that both checks an address range and counts to get the number of block mappings.
>> IIUC, it should be not that hard to exclude kernel mappings. We know
>> kernel_start and kernel_end, so you should be able to maintain a set of
>> counters for kernel, then minus them when you do the calculation for how
>> many page table pages need to be allocated.
> As you said, this is not difficult. However, what I meant was that
> this collection would be done in alloc_init_XXX(), and in that case,
> collecting the number of block mappings for the range
> [kernel_start, kernel_end) and adding conditional logic in
> alloc_init_XXX() seems a bit odd.
> That said, for potential future use cases involving splitting specific ranges,
> I don’t think having this kind of collection is necessarily a bad idea.
I'm not sure whether we are on the same page or not. IIUC the point is
that collecting the counts of PUD/CONT_PMD/PMD mappings by re-walking the
page table is suboptimal and unnecessary for this use case (repainting the
linear mapping). We can simply know the counts at linear mapping creation time.
I don't mean it is a bad idea for your future projects if it is necessary.
Thanks,
Yang
>
>>>>> Second, for a future feature,
>>>>> I hope to add some code to split "specfic" area to be spilt e.x)
>>>>> to set a specific pkey for specific area.
>>>> Could you give more detail on this? My working assumption is that either the
>>>> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
>>>> linear map. If it does, we already have logic to split parts of the linear map
>>>> when needed.
>>> This is not for a linear mapping case. but for a "kernel text area".
>>> As a draft, I want to mark some of kernel code can executable
>>> both kernel and eBPF program.
>>> (I'm trying to make eBPF program non-executable kernel code directly
>>> with POE feature).
>>> For this "executable area" both of kernel and eBPF program
>>> -- typical example is exception entry, It need to split that specific
>>> range and mark them with special POE index.
>> IIUC, you want to change POE attributes for some kernel area (mainly in
>> vmalloc address space). It sounds like you can do something like
>> set_memory_rox(), but just split vmalloc address mapping instead of linear
>> mapping. Or you need preallocate page table pages in this case? Anyway we
>> can have more simple way to count block mappings for splitting linear
>> mapping, it seems not necessary to re-walk page table again IMHO.
> As I said, it isn't not only vmalloc address mapping but also
> "kimage" mapping too.
> In this case, it need to be split to set the specific code area
> with specific POE index.
>
> The preallocate page is for spliting via "stop_machine()"
> since page table allocation with GFP_ATOMIC couldn't be in case of
> PREEMPT_RT in stop_machine().
>
> Also, the spliting text-code area to set specific POE index would be
> done via stop_machine() so, the collection is required.
>
>>>>> In this case, it's useful to rewalk the page-table with the specific
>>>>> range to get the number of block mapping.
>>>>>
>>>>>>> + split_pgtables_idx = 0;
>>>>>>> + split_pgtables_count = 0;
>>>>>>> +
>>>>>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
>>>>>>> + &collect_to_split_ops,
>>>>>>> + NULL, NULL);
>>>>>>> + if (!ret)
>>>>>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
>>>>>>> + &collect_to_split_ops,
>>>>>>> + NULL, NULL);
>>>>>>> + if (ret || !split_pgtables_count)
>>>>>>> + goto error;
>>>>>>> +
>>>>>>> + ret = -ENOMEM;
>>>>>>> +
>>>>>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
>>>>>>> + GFP_KERNEL | __GFP_ZERO);
>>>>>>> + if (!split_pgtables)
>>>>>>> + goto error;
>>>>>>> +
>>>>>>> + for (i = 0; i < split_pgtables_count; i++) {
>>>>>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
>>>>>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
>>>>>>> + if (!split_pgtables[i])
>>>>>>> + goto error;
>>>>>> This looks potentially expensive on the boot path and only gets worse as
>>>>>> the amount of memory grows. Maybe we should predicate this preallocation
>>>>>> on preempt-rt?
>>>>> Agree. then I'll apply pre-allocation with PREEMPT_RT only.
>>>> I guess I'm missing something obvious but I don't understand the problem here...
>>>> We are only deferring the allocation of all these pgtables, so the cost is
>>>> neutral surely? Had we correctly guessed that the system doesn't support BBML2
>>>> earlier, we would have had to allocate all these pgtables earlier.
>>>>
>>>> Another way to look at it is that we are still allocating the same number of
>>>> pgtables in the existing fallback path, it's just that we are doing it inside
>>>> the stop_machine().
>>>>
>>>> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
>>>> with significantly less testing...
>>> IIUC, Will's mention is additional memory allocation for
>>> "split_pgtables" where saved "pre-allocate" page tables.
>>> As the memory increase, definitely this size would increase the cost.
>>>
>>> And this cost need not to burden for !PREEMPT_RT since
>>> it can use memory allocation in stop_machine() with GFP_ATOMIC.
>>>
>>> But I also agree in the aspect that if that cost not much of huge,
>>> It's also convincing and additionally, as I mentioned in another thread,
>>> It would be good not to give a hallucination GFP_ATOMIC is fine for
>>> everywhere even in the PREEMPT_RT.
>>>
>>> --
>>> Sincerely,
>>> Yeoreum Yun
> --
> Sincerely,
> Yeoreum Yun
Hi Yang,
>
>
> On 1/20/26 3:01 PM, Yeoreum Yun wrote:
> > Hi Yang,
> > >
> > > On 1/20/26 1:29 AM, Yeoreum Yun wrote:
> > > > Hi Ryan
> > > > > On 19/01/2026 21:24, Yeoreum Yun wrote:
> > > > > > Hi Will,
> > > > > >
> > > > > > > On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
> > > > > > > > +static int __init linear_map_prealloc_split_pgtables(void)
> > > > > > > > +{
> > > > > > > > + int ret, i;
> > > > > > > > + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> > > > > > > > + unsigned long lend = PAGE_END;
> > > > > > > > + unsigned long kstart = (unsigned long)lm_alias(_stext);
> > > > > > > > + unsigned long kend = (unsigned long)lm_alias(__init_begin);
> > > > > > > > +
> > > > > > > > + const struct mm_walk_ops collect_to_split_ops = {
> > > > > > > > + .pud_entry = collect_to_split_pud_entry,
> > > > > > > > + .pmd_entry = collect_to_split_pmd_entry
> > > > > > > > + };
> > > > > > > Why do we need to rewalk the page-table here instead of collating the
> > > > > > > number of block mappings we put down when creating the linear map in
> > > > > > > the first place?
> > > > > That's a good point; perhaps we can reuse the counters that this series introduces?
> > > > >
> > > > > https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
> > > Yeah, good point. It seems feasible to me. The patch can count how many
> > > PUD/CONT_PMD/PMD mappings, we can calculate how many page table pages need
> > > to be allocated based on those counters.
> > >
> > > > > > First, linear alias of the [_text, __init_begin) is not a target for
> > > > > > the split and it also seems strange to me to add code inside alloc_init_XXX()
> > > > > > that both checks an address range and counts to get the number of block mappings.
> > > IIUC, it should be not that hard to exclude kernel mappings. We know
> > > kernel_start and kernel_end, so you should be able to maintain a set of
> > > counters for kernel, then minus them when you do the calculation for how
> > > many page table pages need to be allocated.
> > As you said, this is not difficult. However, what I meant was that
> > this collection would be done in alloc_init_XXX(), and in that case,
> > collecting the number of block mappings for the range
> > [kernel_start, kernel_end) and adding conditional logic in
> > alloc_init_XXX() seems a bit odd.
> > That said, for potential future use cases involving splitting specific ranges,
> > I don’t think having this kind of collection is necessarily a bad idea.
>
> I'm not sure whether we are on the same page or not. IIUC the point is
> collecting the counts of PUD/CONT_PMD/PMD by re-walking page table is sub
> optimal and unnecessary for this usecase (repainting linear mapping). We can
> simply know the counts at linear mapping creation time.
What I meant was that the counts collected via dm_meminfo already
include kernel aliases, so adding separate code to
count kernel aliases again seems odd to me.
That said, I do not disagree with the efficiency of
using the already collected counts.
Thanks!
[...]
--
Sincerely,
Yeoreum Yun
On 20/01/2026 09:29, Yeoreum Yun wrote:
> Hi Ryan
>> On 19/01/2026 21:24, Yeoreum Yun wrote:
>>> Hi Will,
>>>
>>>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
>>>>> +static int __init linear_map_prealloc_split_pgtables(void)
>>>>> +{
>>>>> + int ret, i;
>>>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
>>>>> + unsigned long lend = PAGE_END;
>>>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
>>>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
>>>>> +
>>>>> + const struct mm_walk_ops collect_to_split_ops = {
>>>>> + .pud_entry = collect_to_split_pud_entry,
>>>>> + .pmd_entry = collect_to_split_pmd_entry
>>>>> + };
>>>>
>>>> Why do we need to rewalk the page-table here instead of collating the
>>>> number of block mappings we put down when creating the linear map in
>>>> the first place?
>>
>> That's a good point; perhaps we can reuse the counters that this series introduces?
>>
>> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
>>
>>>
>>> First, linear alias of the [_text, __init_begin) is not a target for
>>> the split and it also seems strange to me to add code inside alloc_init_XXX()
>>> that both checks an address range and counts to get the number of block mappings.
>>>
>>> Second, for a future feature,
>>> I hope to add some code to split "specfic" area to be spilt e.x)
>>> to set a specific pkey for specific area.
>>
>> Could you give more detail on this? My working assumption is that either the
>> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
>> linear map. If it does, we already have logic to split parts of the linear map
>> when needed.
>
> This is not for a linear mapping case. but for a "kernel text area".
> As a draft, I want to mark some of kernel code can executable
> both kernel and eBPF program.
> (I'm trying to make eBPF program non-executable kernel code directly
> with POE feature).
> For this "executable area" both of kernel and eBPF program
> -- typical example is exception entry, It need to split that specific
> range and mark them with special POE index.
Ahh yes, I recall you mentioning this a while back (although I confess all the
details have fallen out of my head). You'd need to make sure you're definitely
not splitting an area of text that the secondary CPUs are executing while they
are being held in the pen, since at least one of those CPUs doesn't support BBML2.
>
>>
>>>
>>> In this case, it's useful to rewalk the page-table with the specific
>>> range to get the number of block mapping.
>>>
>>>>
>>>>> + split_pgtables_idx = 0;
>>>>> + split_pgtables_count = 0;
>>>>> +
>>>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
>>>>> + &collect_to_split_ops,
>>>>> + NULL, NULL);
>>>>> + if (!ret)
>>>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
>>>>> + &collect_to_split_ops,
>>>>> + NULL, NULL);
>>>>> + if (ret || !split_pgtables_count)
>>>>> + goto error;
>>>>> +
>>>>> + ret = -ENOMEM;
>>>>> +
>>>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
>>>>> + GFP_KERNEL | __GFP_ZERO);
>>>>> + if (!split_pgtables)
>>>>> + goto error;
>>>>> +
>>>>> + for (i = 0; i < split_pgtables_count; i++) {
>>>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
>>>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
>>>>> + if (!split_pgtables[i])
>>>>> + goto error;
>>>>
>>>> This looks potentially expensive on the boot path and only gets worse as
>>>> the amount of memory grows. Maybe we should predicate this preallocation
>>>> on preempt-rt?
>>>
>>> Agree. then I'll apply pre-allocation with PREEMPT_RT only.
>>
>> I guess I'm missing something obvious but I don't understand the problem here...
>> We are only deferring the allocation of all these pgtables, so the cost is
>> neutral surely? Had we correctly guessed that the system doesn't support BBML2
>> earlier, we would have had to allocate all these pgtables earlier.
>>
>> Another way to look at it is that we are still allocating the same number of
>> pgtables in the existing fallback path, it's just that we are doing it inside
>> the stop_machine().
>>
>> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
>> with significantly less testing...
>
> IIUC, Will's mention is additional memory allocation for
> "split_pgtables" where saved "pre-allocate" page tables.
> As the memory increase, definitely this size would increase the cost.
Err, so you're referring to the extra kvmalloc()? I don't think that's a big
deal, is it? You get 512 pointers per page, so the amortized cost is 1/512 = 0.2%?
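(For reference: with 4 KiB pages and 8-byte pointers, one page of the
split_pgtables array holds 4096 / 8 = 512 entries, i.e. roughly one extra
array page per 512 preallocated page-table pages.)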
I suspect we have both misunderstood Will's point...
Thanks,
Ryan
>
> And this cost need not to burden for !PREEMPT_RT since
> it can use memory allocation in stop_machine() with GFP_ATOMIC.
>
> But I also agree in the aspect that if that cost not much of huge,
> It's also convincing and additionally, as I mentioned in another thread,
> It would be good not to give a hallucination GFP_ATOMIC is fine for
> everywhere even in the PREEMPT_RT.
>
> --
> Sincerely,
> Yeoreum Yun
On Tue, Jan 20, 2026 at 10:40:30AM +0000, Ryan Roberts wrote:
> On 20/01/2026 09:29, Yeoreum Yun wrote:
> > Hi Ryan
> >> On 19/01/2026 21:24, Yeoreum Yun wrote:
> >>> Hi Will,
> >>>
> >>>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
> >>>>> +static int __init linear_map_prealloc_split_pgtables(void)
> >>>>> +{
> >>>>> + int ret, i;
> >>>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> >>>>> + unsigned long lend = PAGE_END;
> >>>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
> >>>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
> >>>>> +
> >>>>> + const struct mm_walk_ops collect_to_split_ops = {
> >>>>> + .pud_entry = collect_to_split_pud_entry,
> >>>>> + .pmd_entry = collect_to_split_pmd_entry
> >>>>> + };
> >>>>
> >>>> Why do we need to rewalk the page-table here instead of collating the
> >>>> number of block mappings we put down when creating the linear map in
> >>>> the first place?
> >>
> >> That's a good point; perhaps we can reuse the counters that this series introduces?
> >>
> >> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
> >>
> >>>
> >>> First, linear alias of the [_text, __init_begin) is not a target for
> >>> the split and it also seems strange to me to add code inside alloc_init_XXX()
> >>> that both checks an address range and counts to get the number of block mappings.
> >>>
> >>> Second, for a future feature,
> >>> I hope to add some code to split "specfic" area to be spilt e.x)
> >>> to set a specific pkey for specific area.
> >>
> >> Could you give more detail on this? My working assumption is that either the
> >> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
> >> linear map. If it does, we already have logic to split parts of the linear map
> >> when needed.
> >
> > This is not for a linear mapping case. but for a "kernel text area".
> > As a draft, I want to mark some of kernel code can executable
> > both kernel and eBPF program.
> > (I'm trying to make eBPF program non-executable kernel code directly
> > with POE feature).
> > For this "executable area" both of kernel and eBPF program
> > -- typical example is exception entry, It need to split that specific
> > range and mark them with special POE index.
>
> Ahh yes, I recall you mentioning this a while back (although I confess all the
> deatils have fallen out of my head). You'd need to make sure you're definitely
> not splitting an area of text that the secondary CPUs are executing while they
> are being held in the pen, since at least one of those CPUs doesn't support BBML2.
>
> >
> >>
> >>>
> >>> In this case, it's useful to rewalk the page-table with the specific
> >>> range to get the number of block mapping.
> >>>
> >>>>
> >>>>> + split_pgtables_idx = 0;
> >>>>> + split_pgtables_count = 0;
> >>>>> +
> >>>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> >>>>> + &collect_to_split_ops,
> >>>>> + NULL, NULL);
> >>>>> + if (!ret)
> >>>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
> >>>>> + &collect_to_split_ops,
> >>>>> + NULL, NULL);
> >>>>> + if (ret || !split_pgtables_count)
> >>>>> + goto error;
Just noticed this, but why do we check '!split_pgtables_count' here?
If the page-table is already somehow mapped at page granularity, that
doesn't necessarily sound like a fatal error to me.
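A minimal sketch of treating that case as "nothing to do" instead
(hypothetical, reusing the patch's variables):

	if (ret)
		goto error;
	if (!split_pgtables_count)
		return 0;	/* already mapped at page granularity */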
> >>>>> +
> >>>>> + ret = -ENOMEM;
> >>>>> +
> >>>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
> >>>>> + GFP_KERNEL | __GFP_ZERO);
> >>>>> + if (!split_pgtables)
> >>>>> + goto error;
> >>>>> +
> >>>>> + for (i = 0; i < split_pgtables_count; i++) {
> >>>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
> >>>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
> >>>>> + if (!split_pgtables[i])
> >>>>> + goto error;
> >>>>
> >>>> This looks potentially expensive on the boot path and only gets worse as
> >>>> the amount of memory grows. Maybe we should predicate this preallocation
> >>>> on preempt-rt?
> >>>
> >>> Agree. then I'll apply pre-allocation with PREEMPT_RT only.
> >>
> >> I guess I'm missing something obvious but I don't understand the problem here...
> >> We are only deferring the allocation of all these pgtables, so the cost is
> >> neutral surely? Had we correctly guessed that the system doesn't support BBML2
> >> earlier, we would have had to allocate all these pgtables earlier.
> >>
> >> Another way to look at it is that we are still allocating the same number of
> >> pgtables in the existing fallback path, it's just that we are doing it inside
> >> the stop_machine().
> >>
> >> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
> >> with significantly less testing...
> >
> > IIUC, Will's mention is additional memory allocation for
> > "split_pgtables" where saved "pre-allocate" page tables.
> > As the memory increase, definitely this size would increase the cost.
>
> Err, so you're referring to the extra kvmalloc()? I don't think that's a big
> deal is it? you get 512 pointers per page. So the amortized cost is 1/512= 0.2%?
Right, it was the page-table pages I was worried about, not the array of
pointers.
> I suspect we have both misunderstood Will's point...
I probably just got confused by linear_map_free_split_pgtables() as it
has logic to free unused page-table pages between 'split_pgtables_idx'
and 'split_pgtables_count', implying that we can over-allocate.
If that is only needed for the error path in
linear_map_prealloc_split_pgtables(), then perhaps that part should be
inlined to deal with the case where we fail to allocate part way through.
Will
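
A minimal sketch of that inlined cleanup, reusing the patch's globals
(hypothetical helper name, untested):

	static void __init prealloc_cleanup_partial(unsigned long nr_allocated)
	{
		unsigned long i;

		for (i = 0; i < nr_allocated; i++)
			pagetable_free(split_pgtables[i]);

		kvfree(split_pgtables);
		split_pgtables = NULL;
		split_pgtables_count = 0;
		split_pgtables_idx = 0;
	}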
On 20/01/2026 15:53, Will Deacon wrote:
> On Tue, Jan 20, 2026 at 10:40:30AM +0000, Ryan Roberts wrote:
>> On 20/01/2026 09:29, Yeoreum Yun wrote:
>>> Hi Ryan
>>>> On 19/01/2026 21:24, Yeoreum Yun wrote:
>>>>> Hi Will,
>>>>>
>>>>>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
>>>>>>> +static int __init linear_map_prealloc_split_pgtables(void)
>>>>>>> +{
>>>>>>> + int ret, i;
>>>>>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
>>>>>>> + unsigned long lend = PAGE_END;
>>>>>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
>>>>>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
>>>>>>> +
>>>>>>> + const struct mm_walk_ops collect_to_split_ops = {
>>>>>>> + .pud_entry = collect_to_split_pud_entry,
>>>>>>> + .pmd_entry = collect_to_split_pmd_entry
>>>>>>> + };
>>>>>>
>>>>>> Why do we need to rewalk the page-table here instead of collating the
>>>>>> number of block mappings we put down when creating the linear map in
>>>>>> the first place?
>>>>
>>>> That's a good point; perhaps we can reuse the counters that this series introduces?
>>>>
>>>> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
>>>>
>>>>>
>>>>> First, linear alias of the [_text, __init_begin) is not a target for
>>>>> the split and it also seems strange to me to add code inside alloc_init_XXX()
>>>>> that both checks an address range and counts to get the number of block mappings.
>>>>>
>>>>> Second, for a future feature,
>>>>> I hope to add some code to split "specfic" area to be spilt e.x)
>>>>> to set a specific pkey for specific area.
>>>>
>>>> Could you give more detail on this? My working assumption is that either the
>>>> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
>>>> linear map. If it does, we already have logic to split parts of the linear map
>>>> when needed.
>>>
>>> This is not for a linear mapping case. but for a "kernel text area".
>>> As a draft, I want to mark some of kernel code can executable
>>> both kernel and eBPF program.
>>> (I'm trying to make eBPF program non-executable kernel code directly
>>> with POE feature).
>>> For this "executable area" both of kernel and eBPF program
>>> -- typical example is exception entry, It need to split that specific
>>> range and mark them with special POE index.
>>
>> Ahh yes, I recall you mentioning this a while back (although I confess all the
>> deatils have fallen out of my head). You'd need to make sure you're definitely
>> not splitting an area of text that the secondary CPUs are executing while they
>> are being held in the pen, since at least one of those CPUs doesn't support BBML2.
>>
>>>
>>>>
>>>>>
>>>>> In this case, it's useful to rewalk the page-table with the specific
>>>>> range to get the number of block mapping.
>>>>>
>>>>>>
>>>>>>> + split_pgtables_idx = 0;
>>>>>>> + split_pgtables_count = 0;
>>>>>>> +
>>>>>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
>>>>>>> + &collect_to_split_ops,
>>>>>>> + NULL, NULL);
>>>>>>> + if (!ret)
>>>>>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
>>>>>>> + &collect_to_split_ops,
>>>>>>> + NULL, NULL);
>>>>>>> + if (ret || !split_pgtables_count)
>>>>>>> + goto error;
>
> Just noticed this, but why do we check '!split_pgtables_count' here?
> if the page-table is already somehow mapped at page granularity, that
> doesn't necessarily sound like a fatal error to me.
>
>>>>>>> +
>>>>>>> + ret = -ENOMEM;
>>>>>>> +
>>>>>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
>>>>>>> + GFP_KERNEL | __GFP_ZERO);
>>>>>>> + if (!split_pgtables)
>>>>>>> + goto error;
>>>>>>> +
>>>>>>> + for (i = 0; i < split_pgtables_count; i++) {
>>>>>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
>>>>>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
>>>>>>> + if (!split_pgtables[i])
>>>>>>> + goto error;
>>>>>>
>>>>>> This looks potentially expensive on the boot path and only gets worse as
>>>>>> the amount of memory grows. Maybe we should predicate this preallocation
>>>>>> on preempt-rt?
>>>>>
>>>>> Agree. then I'll apply pre-allocation with PREEMPT_RT only.
>>>>
>>>> I guess I'm missing something obvious but I don't understand the problem here...
>>>> We are only deferring the allocation of all these pgtables, so the cost is
>>>> neutral surely? Had we correctly guessed that the system doesn't support BBML2
>>>> earlier, we would have had to allocate all these pgtables earlier.
>>>>
>>>> Another way to look at it is that we are still allocating the same number of
>>>> pgtables in the existing fallback path, it's just that we are doing it inside
>>>> the stop_machine().
>>>>
>>>> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
>>>> with significantly less testing...
>>>
>>> IIUC, Will's mention is additional memory allocation for
>>> "split_pgtables" where saved "pre-allocate" page tables.
>>> As the memory increase, definitely this size would increase the cost.
>>
>> Err, so you're referring to the extra kvmalloc()? I don't think that's a big
>> deal is it? you get 512 pointers per page. So the amortized cost is 1/512= 0.2%?
>
> Right, it was the page-table pages I was worried about not the array of
> pointers.
>
>> I suspect we have both misunderstood Will's point...
>
> I probably just got confused by linear_map_free_split_pgtables() as it
> has logic to free unused page-table pages between 'split_pgtables_idx'
> and 'split_pgtables_count', implying that we can over-allocate.
>
> If that is only needed for the error path in
> linear_map_prealloc_split_pgtables(), then perhaps that part should be
> inlined to deal with the case where we fail to allocate part way through.
I was originally concerned [1] that there could be a race where another CPU
caused the normal splitting machinery to kick in after this cpu determined the
number of required page tables, so there could be some left over in that case.
On reflection, I guess (hope) that's not possible because we've determined that
some CPUs don't support BBML2. I'm guessing the secondaries haven't been
released to do general work yet?
In which case, I agree, this could be simplified and we could just assert that
all pre-allocated pages get used up if there is no error?
[1] https://lore.kernel.org/all/73ced1db-a2e2-49ea-927e-9fc4a30e771e@arm.com/
>
> Will
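If the race really cannot happen, the simplification Ryan suggests might look something like the following sketch; the variable names are reused from the posted patch, but the helper itself is hypothetical.

```c
/*
 * Hypothetical sketch of the suggested assertion: after a successful
 * split, every preallocated table should have been consumed, so any
 * leftover indicates a bug rather than an expected over-allocation.
 */
static void __init assert_split_pgtables_consumed(int split_ret)
{
	WARN_ON(!split_ret && split_pgtables_idx != split_pgtables_count);
}
```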
Hi Ryan,
> On 20/01/2026 15:53, Will Deacon wrote:
> > On Tue, Jan 20, 2026 at 10:40:30AM +0000, Ryan Roberts wrote:
> >> On 20/01/2026 09:29, Yeoreum Yun wrote:
> >>> Hi Ryan
> >>>> On 19/01/2026 21:24, Yeoreum Yun wrote:
> >>>>> Hi Will,
> >>>>>
> >>>>>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
> >>>>>>> +static int __init linear_map_prealloc_split_pgtables(void)
> >>>>>>> +{
> >>>>>>> + int ret, i;
> >>>>>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> >>>>>>> + unsigned long lend = PAGE_END;
> >>>>>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
> >>>>>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
> >>>>>>> +
> >>>>>>> + const struct mm_walk_ops collect_to_split_ops = {
> >>>>>>> + .pud_entry = collect_to_split_pud_entry,
> >>>>>>> + .pmd_entry = collect_to_split_pmd_entry
> >>>>>>> + };
> >>>>>>
> >>>>>> Why do we need to rewalk the page-table here instead of collating the
> >>>>>> number of block mappings we put down when creating the linear map in
> >>>>>> the first place?
> >>>>
> >>>> That's a good point; perhaps we can reuse the counters that this series introduces?
> >>>>
> >>>> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
> >>>>
> >>>>>
> >>>>> First, linear alias of the [_text, __init_begin) is not a target for
> >>>>> the split and it also seems strange to me to add code inside alloc_init_XXX()
> >>>>> that both checks an address range and counts to get the number of block mappings.
> >>>>>
> >>>>> Second, for a future feature,
> >>>>> I hope to add some code to split "specfic" area to be spilt e.x)
> >>>>> to set a specific pkey for specific area.
> >>>>
> >>>> Could you give more detail on this? My working assumption is that either the
> >>>> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
> >>>> linear map. If it does, we already have logic to split parts of the linear map
> >>>> when needed.
> >>>
> >>> This is not for a linear mapping case. but for a "kernel text area".
> >>> As a draft, I want to mark some of kernel code can executable
> >>> both kernel and eBPF program.
> >>> (I'm trying to make eBPF program non-executable kernel code directly
> >>> with POE feature).
> >>> For this "executable area" both of kernel and eBPF program
> >>> -- typical example is exception entry, It need to split that specific
> >>> range and mark them with special POE index.
> >>
> >> Ahh yes, I recall you mentioning this a while back (although I confess all the
> >> deatils have fallen out of my head). You'd need to make sure you're definitely
> >> not splitting an area of text that the secondary CPUs are executing while they
> >> are being held in the pen, since at least one of those CPUs doesn't support BBML2.
> >>
> >>>
> >>>>
> >>>>>
> >>>>> In this case, it's useful to rewalk the page-table with the specific
> >>>>> range to get the number of block mapping.
> >>>>>
> >>>>>>
> >>>>>>> + split_pgtables_idx = 0;
> >>>>>>> + split_pgtables_count = 0;
> >>>>>>> +
> >>>>>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> >>>>>>> + &collect_to_split_ops,
> >>>>>>> + NULL, NULL);
> >>>>>>> + if (!ret)
> >>>>>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
> >>>>>>> + &collect_to_split_ops,
> >>>>>>> + NULL, NULL);
> >>>>>>> + if (ret || !split_pgtables_count)
> >>>>>>> + goto error;
> >
> > Just noticed this, but why do we check '!split_pgtables_count' here?
> > if the page-table is already somehow mapped at page granularity, that
> > doesn't necessarily sound like a fatal error to me.
> >
> >>>>>>> +
> >>>>>>> + ret = -ENOMEM;
> >>>>>>> +
> >>>>>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
> >>>>>>> + GFP_KERNEL | __GFP_ZERO);
> >>>>>>> + if (!split_pgtables)
> >>>>>>> + goto error;
> >>>>>>> +
> >>>>>>> + for (i = 0; i < split_pgtables_count; i++) {
> >>>>>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
> >>>>>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
> >>>>>>> + if (!split_pgtables[i])
> >>>>>>> + goto error;
> >>>>>>
> >>>>>> This looks potentially expensive on the boot path and only gets worse as
> >>>>>> the amount of memory grows. Maybe we should predicate this preallocation
> >>>>>> on preempt-rt?
> >>>>>
> >>>>> Agree. then I'll apply pre-allocation with PREEMPT_RT only.
> >>>>
> >>>> I guess I'm missing something obvious but I don't understand the problem here...
> >>>> We are only deferring the allocation of all these pgtables, so the cost is
> >>>> neutral surely? Had we correctly guessed that the system doesn't support BBML2
> >>>> earlier, we would have had to allocate all these pgtables earlier.
> >>>>
> >>>> Another way to look at it is that we are still allocating the same number of
> >>>> pgtables in the existing fallback path, it's just that we are doing it inside
> >>>> the stop_machine().
> >>>>
> >>>> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
> >>>> with significantly less testing...
> >>>
> >>> IIUC, Will's mention is additional memory allocation for
> >>> "split_pgtables" where saved "pre-allocate" page tables.
> >>> As the memory increase, definitely this size would increase the cost.
> >>
> >> Err, so you're referring to the extra kvmalloc()? I don't think that's a big
> >> deal is it? you get 512 pointers per page. So the amortized cost is 1/512= 0.2%?
> >
> > Right, it was the page-table pages I was worried about not the array of
> > pointers.
> >
> >> I suspect we have both misunderstood Will's point...
> >
> > I probably just got confused by linear_map_free_split_pgtables() as it
> > has logic to free unused page-table pages between 'split_pgtables_idx'
> > and 'split_pgtables_count', implying that we can over-allocate.
> >
> > If that is only needed for the error path in
> > linear_map_prealloc_split_pgtables(), then perhaps that part should be
> > inlined to deal with the case where we fail to allocate part way through.
>
> I was originally concerned [1] that there could be a race where another CPU
> caused the normal splitting machinery to kick in after this cpu determined the
> number of required page tables, so there could be some left over in that case.
>
> On reflection, I guess (hope) that's not possible because we've determined that
> some CPUs don't support BBML2. I'm guessing the secondaries haven't been
> released to do general work yet?
I don't think so: linear_map_maybe_split_to_ptes() is called from
smp_cpus_done(), but at that point the secondary CPUs are already online
and appear to be schedulable.
That's why, although it is unlikely, another CPU could *split* a region
after we have collected the number of page tables needed, and that is why
I agreed with your comment at the time, despite the *low possibility*.
>
> In which case, I agree, this could be simplified and we could just assert that
> all pre-allocated pages get used up if there is no error?
>
> [1] https://lore.kernel.org/all/73ced1db-a2e2-49ea-927e-9fc4a30e771e@arm.com/
So, for the above reason, I still think we need to keep the logic that
frees any unused page tables.
Am I missing something?
--
Sincerely,
Yeoreum Yun
On 20/01/2026 16:31, Yeoreum Yun wrote:
> Hi Ryan,
>
>> On 20/01/2026 15:53, Will Deacon wrote:
>>> On Tue, Jan 20, 2026 at 10:40:30AM +0000, Ryan Roberts wrote:
>>>> On 20/01/2026 09:29, Yeoreum Yun wrote:
>>>>> Hi Ryan
>>>>>> On 19/01/2026 21:24, Yeoreum Yun wrote:
>>>>>>> Hi Will,
>>>>>>>
>>>>>>>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
>>>>>>>>> +static int __init linear_map_prealloc_split_pgtables(void)
>>>>>>>>> +{
>>>>>>>>> + int ret, i;
>>>>>>>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
>>>>>>>>> + unsigned long lend = PAGE_END;
>>>>>>>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
>>>>>>>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
>>>>>>>>> +
>>>>>>>>> + const struct mm_walk_ops collect_to_split_ops = {
>>>>>>>>> + .pud_entry = collect_to_split_pud_entry,
>>>>>>>>> + .pmd_entry = collect_to_split_pmd_entry
>>>>>>>>> + };
>>>>>>>>
>>>>>>>> Why do we need to rewalk the page-table here instead of collating the
>>>>>>>> number of block mappings we put down when creating the linear map in
>>>>>>>> the first place?
>>>>>>
>>>>>> That's a good point; perhaps we can reuse the counters that this series introduces?
>>>>>>
>>>>>> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
>>>>>>
>>>>>>>
>>>>>>> First, linear alias of the [_text, __init_begin) is not a target for
>>>>>>> the split and it also seems strange to me to add code inside alloc_init_XXX()
>>>>>>> that both checks an address range and counts to get the number of block mappings.
>>>>>>>
>>>>>>> Second, for a future feature,
>>>>>>> I hope to add some code to split "specfic" area to be spilt e.x)
>>>>>>> to set a specific pkey for specific area.
>>>>>>
>>>>>> Could you give more detail on this? My working assumption is that either the
>>>>>> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
>>>>>> linear map. If it does, we already have logic to split parts of the linear map
>>>>>> when needed.
>>>>>
>>>>> This is not for a linear mapping case. but for a "kernel text area".
>>>>> As a draft, I want to mark some of kernel code can executable
>>>>> both kernel and eBPF program.
>>>>> (I'm trying to make eBPF program non-executable kernel code directly
>>>>> with POE feature).
>>>>> For this "executable area" both of kernel and eBPF program
>>>>> -- typical example is exception entry, It need to split that specific
>>>>> range and mark them with special POE index.
>>>>
>>>> Ahh yes, I recall you mentioning this a while back (although I confess all the
>>>> deatils have fallen out of my head). You'd need to make sure you're definitely
>>>> not splitting an area of text that the secondary CPUs are executing while they
>>>> are being held in the pen, since at least one of those CPUs doesn't support BBML2.
>>>>
>>>>>
>>>>>>
>>>>>>>
>>>>>>> In this case, it's useful to rewalk the page-table with the specific
>>>>>>> range to get the number of block mapping.
>>>>>>>
>>>>>>>>
>>>>>>>>> + split_pgtables_idx = 0;
>>>>>>>>> + split_pgtables_count = 0;
>>>>>>>>> +
>>>>>>>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
>>>>>>>>> + &collect_to_split_ops,
>>>>>>>>> + NULL, NULL);
>>>>>>>>> + if (!ret)
>>>>>>>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
>>>>>>>>> + &collect_to_split_ops,
>>>>>>>>> + NULL, NULL);
>>>>>>>>> + if (ret || !split_pgtables_count)
>>>>>>>>> + goto error;
>>>
>>> Just noticed this, but why do we check '!split_pgtables_count' here?
>>> if the page-table is already somehow mapped at page granularity, that
>>> doesn't necessarily sound like a fatal error to me.
>>>
>>>>>>>>> +
>>>>>>>>> + ret = -ENOMEM;
>>>>>>>>> +
>>>>>>>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
>>>>>>>>> + GFP_KERNEL | __GFP_ZERO);
>>>>>>>>> + if (!split_pgtables)
>>>>>>>>> + goto error;
>>>>>>>>> +
>>>>>>>>> + for (i = 0; i < split_pgtables_count; i++) {
>>>>>>>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
>>>>>>>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
>>>>>>>>> + if (!split_pgtables[i])
>>>>>>>>> + goto error;
>>>>>>>>
>>>>>>>> This looks potentially expensive on the boot path and only gets worse as
>>>>>>>> the amount of memory grows. Maybe we should predicate this preallocation
>>>>>>>> on preempt-rt?
>>>>>>>
>>>>>>> Agree. then I'll apply pre-allocation with PREEMPT_RT only.
>>>>>>
>>>>>> I guess I'm missing something obvious but I don't understand the problem here...
>>>>>> We are only deferring the allocation of all these pgtables, so the cost is
>>>>>> neutral surely? Had we correctly guessed that the system doesn't support BBML2
>>>>>> earlier, we would have had to allocate all these pgtables earlier.
>>>>>>
>>>>>> Another way to look at it is that we are still allocating the same number of
>>>>>> pgtables in the existing fallback path, it's just that we are doing it inside
>>>>>> the stop_machine().
>>>>>>
>>>>>> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
>>>>>> with significantly less testing...
>>>>>
>>>>> IIUC, Will's mention is additional memory allocation for
>>>>> "split_pgtables" where saved "pre-allocate" page tables.
>>>>> As the memory increase, definitely this size would increase the cost.
>>>>
>>>> Err, so you're referring to the extra kvmalloc()? I don't think that's a big
>>>> deal is it? you get 512 pointers per page. So the amortized cost is 1/512= 0.2%?
>>>
>>> Right, it was the page-table pages I was worried about not the array of
>>> pointers.
>>>
>>>> I suspect we have both misunderstood Will's point...
>>>
>>> I probably just got confused by linear_map_free_split_pgtables() as it
>>> has logic to free unused page-table pages between 'split_pgtables_idx'
>>> and 'split_pgtables_count', implying that we can over-allocate.
>>>
>>> If that is only needed for the error path in
>>> linear_map_prealloc_split_pgtables(), then perhaps that part should be
>>> inlined to deal with the case where we fail to allocate part way through.
>>
>> I was originally concerned [1] that there could be a race where another CPU
>> caused the normal splitting machinery to kick in after this cpu determined the
>> number of required page tables, so there could be some left over in that case.
>>
>> On reflection, I guess (hope) that's not possible because we've determined that
>> some CPUs don't support BBML2. I'm guessing the secondaries haven't been
>> released to do general work yet?
>
> I don't think so, since the linear_map_maybe_split_to_ptes() called
> in smp_cpus_done() but in here, secondary cpus already on and
> it seems schedulable.
>
> That's why although, This is unlikely, after collecting the number of
> splitiing by other cpu have a possibility to *split* which was counted
> and at that time I agreed for your comments because of this *low
> possiblity*.
>
>>
>> In which case, I agree, this could be simplified and we could just assert that
>> all pre-allocated pages get used up if there is no error?
>>
>> [1] https://lore.kernel.org/all/73ced1db-a2e2-49ea-927e-9fc4a30e771e@arm.com/
>
> So with above reason, I still think it need to sustain the free
> unused pagetable.
>
> Am I missing something?
My concern is that if a secondary CPU can race and cause a split, that is
unsound because we have determined that although the primary CPU supports BBML2,
at least one of the secondary CPUs does not. So splitting a live mapping is unsafe.
I just had a brief chat with Rutland, and he agrees that this _could_ be a
problem. Basically there is a window between onlining the secondary cpus and
entering the stop_machine() where one of those cpus _could_ end up doing
something that causes us to split the linear map.
I'm not immediately sure how to solve that.
>
> --
> Sincerely,
> Yeoreum Yun
>> My concern is that if a secondary CPU can race and cause a split, that is
>> unsound because we have determined that although the primary CPU supports BBML2,
>> at least one of the secondary CPUs does not. So splitting a live mapping is unsafe.
>>
>> I just had a brief chat with Rutland, and he agrees that this _could_ be a
>> problem. Basically there is a window between onlining the secondary cpus and
>> entering the stop_machine() where one of those cpus _could_ end up doing
>> something that causes us to split the linear map.

If I remember correctly, split_kernel_leaf_mapping() does call
system_supports_bbml2_noabort() before doing real split. So we basically
should fall into two categories:

1. bbml2_noabort is supported on all cpus. Everything is fine.
2. bbml2_noabort is not supported on all cpus. split_kernel_leaf_mapping()
just returns 0. Kernel doesn't split page table, so there won't be TLB
conflict issue. But the following page prot update may see unexpected block
mapping, then a WARN will be raised and it will return -EINVAL. So the
worst case is the caller will fail (IIRC all the callers of set_memory_*()
handle the failure), and we can know who is trying to change linear mapping
before the linear mapping gets finalized. AFAICT I
haven't seen such WARN yet.

>>
>> I'm not immediately sure how to solve that.

Do we need some synchronization mechanism? If the linear mapping is not
finalized yet, split_kernel_leaf_mapping() will spin. For example, something
like this off the top of my head,

DEFINE_STATIC_KEY_FALSE(linear_mapping_finalized);

Once the linear mapping is finalized, we can call
static_branch_enable(&linear_mapping_finalized);

In split_kernel_leaf_mapping(), we can just do:

retry:
	if (!static_branch_likely(&linear_mapping_finalized))
		goto retry;

There may be better way to handle it. But this case should be very unlikely
IMHO. It sounds crazy to have such complicated kernel threads run so early.
I'm not sure whether we should pay immediate attention to it or not.

Thanks,
Yang

>>
>> --
>> Sincerely,
>> Yeoreum Yun
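To make the two categories Yang describes concrete, here is a minimal sketch of that behaviour; the body is illustrative only, and while system_supports_bbml2_noabort() and split_kernel_leaf_mapping() are the names used in the discussion, this is not the actual arch/arm64/mm code.

```c
/*
 * Illustrative sketch (not the real split_kernel_leaf_mapping()):
 * how a split request behaves when some CPU lacks BBML2_NOABORT.
 */
static int split_kernel_leaf_mapping_sketch(unsigned long start, unsigned long end)
{
	/*
	 * Category 2: without BBML2_NOABORT on every CPU, splitting a live
	 * block mapping is unsafe, so refuse to split and report success.
	 * A later permission change that still finds a block mapping will
	 * WARN and fail with -EINVAL, which set_memory_*() callers handle.
	 */
	if (!system_supports_bbml2_noabort())
		return 0;

	/* Category 1: all CPUs support BBML2_NOABORT; split [start, end). */
	/* ... walk the range and split block mappings down to PTEs ... */
	return 0;
}
```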
> > > My concern is that if a secondary CPU can race and cause a split, that is
> > > unsound because we have determined that although the primary CPU supports BBML2,
> > > at least one of the secondary CPUs does not. So splitting a live mapping is unsafe.
> > >
> > > I just had a brief chat with Rutland, and he agrees that this _could_ be a
> > > problem. Basically there is a window between onlining the secondary cpus and
> > > entering the stop_machine() where one of those cpus _could_ end up doing
> > > something that causes us to split the linear map.
>
> If I remember correctly, split_kernel_leaf_mapping() does call
> system_supports_bbml2_noabort() before doing real split. So we basically
> should fall into two categories:
>
> 1. bbml2_noabort is supported on all cpus. Everything is fine.
> 2. bbml2_noabort is not supported on all cpus. split_kernel_leaf_mapping()
> just returns 0. Kernel doesn't split page table, so there won't be TLB
> conflict issue. But the following page prot update may see unexpected block
> mapping, then a WARN will be raised and it will return -EINVAL. So the
> worst case is the caller will fail (IIRC all the callers of set_memory_*()
> handle the failure), and we can know who is trying to change linear mapping
> before the linear mapping gets finalized. AFAICT I
> haven't seen such WARN yet.

Thanks for the great detail :)
I've missed system_supports_bbml2_noabort() in split_kernel_leaf_mapping().

> > >
> > > I'm not immediately sure how to solve that.
>
> Do we need some synchronization mechanism? If the linear mapping is not
> finalized yet, split_kernel_leaf_mapping() will spin. For example, something
> like this off the top of my head,
>
> DEFINE_STATIC_KEY_FALSE(linear_mapping_finalized);
>
> Once the linear mapping is finalized, we can call
> static_branch_enable(&linear_mapping_finalized);
>
> In split_kernel_leaf_mapping(), we can just do:
>
> retry:
> if (!static_branch_likely(&linear_mapping_finalized))
> goto retry;
>
> There may be better way to handle it. But this case should be very unlikely
> IMHO. It sounds crazy to have such complicated kernel threads run so early.
> I'm not sure whether we should pay immediate attention to it or not.

Thinking about it again, I’m not sure whether
it is acceptable to use a sleepable synchronization mechanism at this stage,
like split_kernel_leaf_mapping() does with mutex_lock()
(even though it may be technically possible).
It also feels odd that this function can be called at this point in time.

If this is indeed considered problematic,
I think it would be better to simply return -EINVAL immediately
when linear_mapping_finalized has not yet been completed.

--
Sincerely,
Yeoreum Yun
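As a rough illustration of the alternative Yeoreum floats at the end of that mail, the check could fail fast rather than spin. This is only a sketch: 'linear_mapping_finalized' is the hypothetical static key from Yang's suggestion, not an existing kernel symbol.

```c
#include <linux/jump_label.h>

/* Hypothetical static key from the suggestion above. */
DEFINE_STATIC_KEY_FALSE(linear_mapping_finalized);

/*
 * Sketch of the fail-fast alternative: instead of spinning until the
 * linear map is finalized, refuse the split outright. Callers of
 * set_memory_*() already cope with failure, so -EINVAL is propagated
 * rather than hidden.
 */
static int check_linear_map_finalized(void)
{
	if (!static_branch_likely(&linear_mapping_finalized))
		return -EINVAL;
	return 0;
}
```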
On 21/01/2026 08:32, Yeoreum Yun wrote:
>>>> My concern is that if a secondary CPU can race and cause a split, that is
>>>> unsound because we have determined that although the primary CPU supports BBML2,
>>>> at least one of the secondary CPUs does not. So splitting a live mapping is unsafe.
>>>>
>>>> I just had a brief chat with Rutland, and he agrees that this _could_ be a
>>>> problem. Basically there is a window between onlining the secondary cpus and
>>>> entering the stop_machine() where one of those cpus _could_ end up doing
>>>> something that causes us to split the linear map.
>>
>> If I remember correctly, split_kernel_leaf_mapping() does call
>> system_supports_bbml2_noabort() before doing real split. So we basically
>> should fall into two categories:
>>
>> 1. bbml2_noabort is supported on all cpus. Everything is fine.
>> 2. bbml2_noabort is not supported on all cpus. split_kernel_leaf_mapping()
>> just returns 0. Kernel doesn't split page table, so there won't be TLB
>> conflict issue. But the following page prot update may see unexpected block
>> mapping, then a WARN will be raised and it will return -EINVAL. So the
>> worst case is the caller will fail (IIRC all the callers of set_memory_*()
>> handle the failure), and we can know who is trying to change linear mapping
>> before the linear mapping gets finalized. AFAICT I
>> haven't seen such WARN yet.

Ahh good point! So this isn't quite as terrible as I was thinking.

>
> Thanks for the great detail :)
> I've missed system_supports_bbml2_noabort() in split_kernel_leaf_mapping().
>
>>>>
>>>> I'm not immediately sure how to solve that.
>>
>> Do we need some synchronization mechanism? If the linear mapping is not
>> finalized yet, split_kernel_leaf_mapping() will spin. For example, something
>> like this off the top of my head,
>>
>> DEFINE_STATIC_KEY_FALSE(linear_mapping_finalized);
>>
>> Once the linear mapping is finalized, we can call
>> static_branch_enable(&linear_mapping_finalized);
>>
>> In split_kernel_leaf_mapping(), we can just do:
>>
>> retry:
>> if (!static_branch_likely(&linear_mapping_finalized))
>> goto retry;

Yuck... But I guess it might work as long as the primary thread never does
anything that would cause an attempt to split; otherwise we have a deadlock.

>>
>> There may be better way to handle it. But this case should be very unlikely
>> IMHO. It sounds crazy to have such complicated kernel threads run so early.
>> I'm not sure whether we should pay immediate attention to it or not.

I think we need to figure out if this is actually possible. We bring up the
secondary cpus, set system caps and finalize the linear map in smp_init().
That's called from kernel_init_freeable() which is called from kernel_init(),
which is invoked as a thread pinned to the boot cpu.

sched_init_smp() is called after smp_init() (i.e. after the linear map is
finalized). I'm guessing (based on the name of sched_init_smp()) that nothing
other than the idle thread will run on any secondaries until after
sched_init_smp() is called? (I'd be grateful if anyone can confirm that).

Rutland suggested that it's probably too early for any PM type stuff to be
running in the idle loop, so based on all of that, perhaps this is not a problem
after all and there is basically zero chance of a secondary cpu doing anything
that could cause a linear map split during this window?

I'm inclined to leave this as is for now.

Thanks,
Ryan
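For readers less familiar with the early boot flow being described, the ordering relied on above can be summarised roughly as follows; this is a simplified sketch based on the functions named in the mail, not verbatim init/main.c code.

```c
/*
 * Rough ordering of the relevant early-boot steps (simplified sketch):
 *
 *   kernel_init()                  kthread pinned to the boot CPU
 *     kernel_init_freeable()
 *       smp_init()                 secondary CPUs brought up, system caps
 *                                  finalised, linear-map split happens here
 *       sched_init_smp()           scheduler domains built; only after this
 *                                  can regular tasks run on secondary CPUs
 */
```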
On 1/21/26 2:20 AM, Ryan Roberts wrote:
> Ahh good point! So this isn't quite as terrible as I was thinking.

Yeah.

> I think we need to figure out if this is actually possible. We bring up the
> secondary cpus, set system caps and finalize the linear map in smp_init().
> That's called from kernel_init_freeable() which is called from kernel_init(),
> which is invoked as a thread pinned to the boot cpu.
>
> sched_init_smp() is called after smp_init() (i.e. after the linear map is
> finalized). I'm guessing (based on the name of sched_init_smp()) that nothing
> other than the idle thread will run on any secondaries until after
> sched_init_smp() is called? (I'd be grateful if anyone can confirm that).
>
> Rutland suggested that it's probably too early for any PM type stuff to be
> running in the idle loop, so based on all of that, perhaps this is not a problem
> after all and there is basically zero chance of a secondary cpu doing anything
> that could cause a linear map split during this window?
>
> I'm inclined to leave this as is for now.

I agree. I don't think this would be a real problem.

Thanks,
Yang
On Wed, Jan 21, 2026 at 02:57:28PM -0800, Yang Shi wrote:
> On 1/21/26 2:20 AM, Ryan Roberts wrote:
> > Rutland suggested that it's probably too early for any PM type stuff to be
> > running in the idle loop, so based on all of that, perhaps this is not a problem
> > after all and there is basically zero chance of a secondary cpu doing anything
> > that could cause a linear map split during this window?
> >
> > I'm inclined to leave this as is for now.
>
> I agree. I don't think this would be a real problem.
>
> Thanks,
> Yang

Although partially using GFP_ATOMIC might not be an issue given that
there is no contention at the moment,
technically using a memory allocation API inside stop_machine() is problematic
for PREEMPT_RT, and the relevant page tables should be pre-allocated.

That said, taking a step back (I’m not sure why I was being so stubborn about this),
since the kernel_alias area is mapped using block mappings,
a simple calculation based on your dm_meminfo patch should be sufficient to
determine the number of page tables that need to be pre-allocated for
splitting the linear mapping, without having to walk the page tables again.

So, after your dm_meminfo patch, I plan to respin this patch based on that.

Am I missing anything?

--
Sincerely,
Yeoreum Yun
>>> I'm inclined to leave this as is for now.
>>
>> I agree. I don't think this would be a real problem.
>>
>> Thanks,
>> Yang
>
> Although partially using GFP_ATOMIC might not be an issue given that
> there is no contention at the moment,
> technically using a memory allocation API inside stop_machine() is problematic
> for PREEMPT_RT, and the relevant page tables should be pre-allocated.
>
> That said, taking a step back (I’m not sure why I was being so stubborn about this),
> since the kernel_alias area is mapped using block mappings,
> a simple calculation based on your dm_meminfo patch should be sufficient to
> determine the number of page tables that need to be pre-allocated for
> splitting the linear mapping, without having to walk the page tables again.
>
> So, after your dm_meminfo patch, I plan to respin this patch based on that.
>
> Am I missing anything?

Yeoreum and I had an offline chat about all this and I just want to spell out
the conclusion for everybody; We believe it's impossible for the secondary CPUs
to have any scheduled work during this window, and therefore it's impossible for
the split race to occur, and it's impossible for there to be any contention on
the page allocator lock.

All of which means that in practice, there are no bugs here and the code is safe
as is, even for PREEMPT_RT. That said, allocating memory inside of stop_machine()
is a bad smell, so Yeoreum will rectify this down the track; it's not urgent though.

Thanks,
Ryan
On Wed, Jan 21, 2026 at 10:20:52AM +0000, Ryan Roberts wrote:
> I think we need to figure out if this is actually possible. We bring up the
> secondary cpus, set system caps and finalize the linear map in smp_init().
> That's called from kernel_init_freeable() which is called from kernel_init(),
> which is invoked as a thread pinned to the boot cpu.
>
> sched_init_smp() is called after smp_init() (i.e. after the linear map is
> finalized). I'm guessing (based on the name of sched_init_smp()) that nothing
> other than the idle thread will run on any secondaries until after
> sched_init_smp() is called? (I'd be grateful if anyone can confirm that).
>
> Rutland suggested that it's probably too early for any PM type stuff to be
> running in the idle loop, so based on all of that, perhaps this is not a problem
> after all and there is basically zero chance of a secondary cpu doing anything
> that could cause a linear map split during this window?
>
> I'm inclined to leave this as is for now.

IIUC, since this happens before the sched_domains have been built,
wake_up_new_task()'s CPU selection via select_task_rq_fair() will always
end up picking prev_cpu. So I agree with you: there is essentially zero
chance of a secondary CPU doing anything during this window.

>
> Thanks,
> Ryan

--
Sincerely,
Yeoreum Yun
On Tue, Jan 20, 2026 at 05:35:50PM +0000, Ryan Roberts wrote:
> On 20/01/2026 16:31, Yeoreum Yun wrote:
> > Hi Ryan,
> >
> >> On 20/01/2026 15:53, Will Deacon wrote:
> >>> On Tue, Jan 20, 2026 at 10:40:30AM +0000, Ryan Roberts wrote:
> >>>> On 20/01/2026 09:29, Yeoreum Yun wrote:
> >>>>> Hi Ryan
> >>>>>> On 19/01/2026 21:24, Yeoreum Yun wrote:
> >>>>>>> Hi Will,
> >>>>>>>
> >>>>>>>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
> >>>>>>>>> +static int __init linear_map_prealloc_split_pgtables(void)
> >>>>>>>>> +{
> >>>>>>>>> + int ret, i;
> >>>>>>>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> >>>>>>>>> + unsigned long lend = PAGE_END;
> >>>>>>>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
> >>>>>>>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
> >>>>>>>>> +
> >>>>>>>>> + const struct mm_walk_ops collect_to_split_ops = {
> >>>>>>>>> + .pud_entry = collect_to_split_pud_entry,
> >>>>>>>>> + .pmd_entry = collect_to_split_pmd_entry
> >>>>>>>>> + };
> >>>>>>>>
> >>>>>>>> Why do we need to rewalk the page-table here instead of collating the
> >>>>>>>> number of block mappings we put down when creating the linear map in
> >>>>>>>> the first place?
> >>>>>>
> >>>>>> That's a good point; perhaps we can reuse the counters that this series introduces?
> >>>>>>
> >>>>>> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
> >>>>>>
> >>>>>>>
> >>>>>>> First, linear alias of the [_text, __init_begin) is not a target for
> >>>>>>> the split and it also seems strange to me to add code inside alloc_init_XXX()
> >>>>>>> that both checks an address range and counts to get the number of block mappings.
> >>>>>>>
> >>>>>>> Second, for a future feature,
> >>>>>>> I hope to add some code to split "specfic" area to be spilt e.x)
> >>>>>>> to set a specific pkey for specific area.
> >>>>>>
> >>>>>> Could you give more detail on this? My working assumption is that either the
> >>>>>> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
> >>>>>> linear map. If it does, we already have logic to split parts of the linear map
> >>>>>> when needed.
> >>>>>
> >>>>> This is not for a linear mapping case. but for a "kernel text area".
> >>>>> As a draft, I want to mark some of kernel code can executable
> >>>>> both kernel and eBPF program.
> >>>>> (I'm trying to make eBPF program non-executable kernel code directly
> >>>>> with POE feature).
> >>>>> For this "executable area" both of kernel and eBPF program
> >>>>> -- typical example is exception entry, It need to split that specific
> >>>>> range and mark them with special POE index.
> >>>>
> >>>> Ahh yes, I recall you mentioning this a while back (although I confess all the
> >>>> deatils have fallen out of my head). You'd need to make sure you're definitely
> >>>> not splitting an area of text that the secondary CPUs are executing while they
> >>>> are being held in the pen, since at least one of those CPUs doesn't support BBML2.
> >>>>
> >>>>>
> >>>>>>
> >>>>>>>
> >>>>>>> In this case, it's useful to rewalk the page-table with the specific
> >>>>>>> range to get the number of block mapping.
> >>>>>>>
> >>>>>>>>
> >>>>>>>>> + split_pgtables_idx = 0;
> >>>>>>>>> + split_pgtables_count = 0;
> >>>>>>>>> +
> >>>>>>>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> >>>>>>>>> + &collect_to_split_ops,
> >>>>>>>>> + NULL, NULL);
> >>>>>>>>> + if (!ret)
> >>>>>>>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
> >>>>>>>>> + &collect_to_split_ops,
> >>>>>>>>> + NULL, NULL);
> >>>>>>>>> + if (ret || !split_pgtables_count)
> >>>>>>>>> + goto error;
> >>>
> >>> Just noticed this, but why do we check '!split_pgtables_count' here?
> >>> if the page-table is already somehow mapped at page granularity, that
> >>> doesn't necessarily sound like a fatal error to me.
> >>>
> >>>>>>>>> +
> >>>>>>>>> + ret = -ENOMEM;
> >>>>>>>>> +
> >>>>>>>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
> >>>>>>>>> + GFP_KERNEL | __GFP_ZERO);
> >>>>>>>>> + if (!split_pgtables)
> >>>>>>>>> + goto error;
> >>>>>>>>> +
> >>>>>>>>> + for (i = 0; i < split_pgtables_count; i++) {
> >>>>>>>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
> >>>>>>>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
> >>>>>>>>> + if (!split_pgtables[i])
> >>>>>>>>> + goto error;
> >>>>>>>>
> >>>>>>>> This looks potentially expensive on the boot path and only gets worse as
> >>>>>>>> the amount of memory grows. Maybe we should predicate this preallocation
> >>>>>>>> on preempt-rt?
> >>>>>>>
> >>>>>>> Agree. then I'll apply pre-allocation with PREEMPT_RT only.
> >>>>>>
> >>>>>> I guess I'm missing something obvious but I don't understand the problem here...
> >>>>>> We are only deferring the allocation of all these pgtables, so the cost is
> >>>>>> neutral surely? Had we correctly guessed that the system doesn't support BBML2
> >>>>>> earlier, we would have had to allocate all these pgtables earlier.
> >>>>>>
> >>>>>> Another way to look at it is that we are still allocating the same number of
> >>>>>> pgtables in the existing fallback path, it's just that we are doing it inside
> >>>>>> the stop_machine().
> >>>>>>
> >>>>>> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
> >>>>>> with significantly less testing...
> >>>>>
> >>>>> IIUC, Will's mention is additional memory allocation for
> >>>>> "split_pgtables" where saved "pre-allocate" page tables.
> >>>>> As the memory increase, definitely this size would increase the cost.
> >>>>
> >>>> Err, so you're referring to the extra kvmalloc()? I don't think that's a big
> >>>> deal is it? you get 512 pointers per page. So the amortized cost is 1/512= 0.2%?
> >>>
> >>> Right, it was the page-table pages I was worried about not the array of
> >>> pointers.
> >>>
> >>>> I suspect we have both misunderstood Will's point...
> >>>
> >>> I probably just got confused by linear_map_free_split_pgtables() as it
> >>> has logic to free unused page-table pages between 'split_pgtables_idx'
> >>> and 'split_pgtables_count', implying that we can over-allocate.
> >>>
> >>> If that is only needed for the error path in
> >>> linear_map_prealloc_split_pgtables(), then perhaps that part should be
> >>> inlined to deal with the case where we fail to allocate part way through.
> >>
> >> I was originally concerned [1] that there could be a race where another CPU
> >> caused the normal splitting machinery to kick in after this cpu determined the
> >> number of required page tables, so there could be some left over in that case.
> >>
> >> On reflection, I guess (hope) that's not possible because we've determined that
> >> some CPUs don't support BBML2. I'm guessing the secondaries haven't been
> >> released to do general work yet?
> >
> > I don't think so, since the linear_map_maybe_split_to_ptes() called
> > in smp_cpus_done() but in here, secondary cpus already on and
> > it seems schedulable.
> >
> > That's why although, This is unlikely, after collecting the number of
> > splitiing by other cpu have a possibility to *split* which was counted
> > and at that time I agreed for your comments because of this *low
> > possiblity*.
> >
> >>
> >> In which case, I agree, this could be simplified and we could just assert that
> >> all pre-allocated pages get used up if there is no error?
> >>
> >> [1] https://lore.kernel.org/all/73ced1db-a2e2-49ea-927e-9fc4a30e771e@arm.com/
> >
> > So with above reason, I still think it need to sustain the free
> > unused pagetable.
> >
> > Am I missing something?
>
> My concern is that if a secondary CPU can race and cause a split, that is
> unsound because we have determined that although the primary CPU supports BBML2,
> at least one of the secondary CPUs does not. So splitting a live mapping is unsafe.
>
> I just had a brief chat with Rutland, and he agrees that this _could_ be a
> problem. Basically there is a window between onlining the secondary cpus and
> entering the stop_machine() where one of those cpus _could_ end up doing
> something that causes us to split the linear map.
Regardless of BBML2, does it mean it would be a problem to call the
set_memory_xxx() API during this window?
For example:

  CPU0 (boot)                               CPU1 (secondary)
  linear_map_maybe_split_to_ptes()
    collect the number of page tables
                                            set_memory_xxx()
                                              split the specific linear region
    preallocate() and split

TBH, I'm not sure why this scenario would be a problem.
> > --
> > Sincerely,
> > Yeoreum Yun
>
--
Sincerely,
Yeoreum Yun
Hi Will,
> > On 20/01/2026 09:29, Yeoreum Yun wrote:
> > > Hi Ryan
> > >> On 19/01/2026 21:24, Yeoreum Yun wrote:
> > >>> Hi Will,
> > >>>
> > >>>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
> > >>>>> +static int __init linear_map_prealloc_split_pgtables(void)
> > >>>>> +{
> > >>>>> + int ret, i;
> > >>>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> > >>>>> + unsigned long lend = PAGE_END;
> > >>>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
> > >>>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
> > >>>>> +
> > >>>>> + const struct mm_walk_ops collect_to_split_ops = {
> > >>>>> + .pud_entry = collect_to_split_pud_entry,
> > >>>>> + .pmd_entry = collect_to_split_pmd_entry
> > >>>>> + };
> > >>>>
> > >>>> Why do we need to rewalk the page-table here instead of collating the
> > >>>> number of block mappings we put down when creating the linear map in
> > >>>> the first place?
> > >>
> > >> That's a good point; perhaps we can reuse the counters that this series introduces?
> > >>
> > >> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
> > >>
> > >>>
> > >>> First, linear alias of the [_text, __init_begin) is not a target for
> > >>> the split and it also seems strange to me to add code inside alloc_init_XXX()
> > >>> that both checks an address range and counts to get the number of block mappings.
> > >>>
> > >>> Second, for a future feature,
> > >>> I hope to add some code to split "specfic" area to be spilt e.x)
> > >>> to set a specific pkey for specific area.
> > >>
> > >> Could you give more detail on this? My working assumption is that either the
> > >> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
> > >> linear map. If it does, we already have logic to split parts of the linear map
> > >> when needed.
> > >
> > > This is not for a linear mapping case. but for a "kernel text area".
> > > As a draft, I want to mark some of kernel code can executable
> > > both kernel and eBPF program.
> > > (I'm trying to make eBPF program non-executable kernel code directly
> > > with POE feature).
> > > For this "executable area" both of kernel and eBPF program
> > > -- typical example is exception entry, It need to split that specific
> > > range and mark them with special POE index.
> >
> > Ahh yes, I recall you mentioning this a while back (although I confess all the
> > deatils have fallen out of my head). You'd need to make sure you're definitely
> > not splitting an area of text that the secondary CPUs are executing while they
> > are being held in the pen, since at least one of those CPUs doesn't support BBML2.
> >
> > >
> > >>
> > >>>
> > >>> In this case, it's useful to rewalk the page-table with the specific
> > >>> range to get the number of block mapping.
> > >>>
> > >>>>
> > >>>>> + split_pgtables_idx = 0;
> > >>>>> + split_pgtables_count = 0;
> > >>>>> +
> > >>>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> > >>>>> + &collect_to_split_ops,
> > >>>>> + NULL, NULL);
> > >>>>> + if (!ret)
> > >>>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
> > >>>>> + &collect_to_split_ops,
> > >>>>> + NULL, NULL);
> > >>>>> + if (ret || !split_pgtables_count)
> > >>>>> + goto error;
>
> Just noticed this, but why do we check '!split_pgtables_count' here?
> if the page-table is already somehow mapped at page granularity, that
> doesn't necessarily sound like a fatal error to me.
!split_pgtables_count is not an "error" case; it is just there to skip
the remaining logic.
Perhaps the label name "error" is what caused the confusion?
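A minimal sketch of the control flow being described, assuming the names from the posted patch; the helper itself is hypothetical and only illustrates treating a zero count as "nothing to do" rather than as a failure.

```c
/*
 * Illustrative only: a zero count from the walk is not a failure, it just
 * means the linear map is already at page granularity and there is nothing
 * to preallocate.
 */
static int __init prealloc_split_pgtables_sketch(int walk_ret, unsigned long count)
{
	if (walk_ret)
		return walk_ret;	/* genuine error from the page-table walk */
	if (!count)
		return 0;		/* already split: skip the preallocation */

	/* ... kvmalloc() the pointer array and pagetable_alloc() 'count' tables ... */
	return 0;
}
```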
>
> > >>>>> +
> > >>>>> + ret = -ENOMEM;
> > >>>>> +
> > >>>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
> > >>>>> + GFP_KERNEL | __GFP_ZERO);
> > >>>>> + if (!split_pgtables)
> > >>>>> + goto error;
> > >>>>> +
> > >>>>> + for (i = 0; i < split_pgtables_count; i++) {
> > >>>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
> > >>>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
> > >>>>> + if (!split_pgtables[i])
> > >>>>> + goto error;
> > >>>>
> > >>>> This looks potentially expensive on the boot path and only gets worse as
> > >>>> the amount of memory grows. Maybe we should predicate this preallocation
> > >>>> on preempt-rt?
> > >>>
> > >>> Agreed. Then I'll apply pre-allocation only for PREEMPT_RT.
> > >>
> > >> I guess I'm missing something obvious but I don't understand the problem here...
> > >> We are only deferring the allocation of all these pgtables, so the cost is
> > >> neutral surely? Had we correctly guessed that the system doesn't support BBML2
> > >> earlier, we would have had to allocate all these pgtables earlier.
> > >>
> > >> Another way to look at it is that we are still allocating the same number of
> > >> pgtables in the existing fallback path, it's just that we are doing it inside
> > >> the stop_machine().
> > >>
> > >> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
> > >> with significantly less testing...
> > >
> > > IIUC, Will is referring to the additional memory allocation for
> > > "split_pgtables", the array that holds the pre-allocated page tables.
> > > As memory size grows, this array definitely grows with it.
> >
> > Err, so you're referring to the extra kvmalloc()? I don't think that's a big
> > deal, is it? You get 512 pointers per page, so the amortized cost is 1/512 ~= 0.2%?
>
> Right, it was the page-table pages I was worried about not the array of
> pointers.
>
> > I suspect we have both misunderstood Will's point...
>
> I probably just got confused by linear_map_free_split_pgtables() as it
> has logic to free unused page-table pages between 'split_pgtables_idx'
> and 'split_pgtables_count', implying that we can over-allocate.
>
> If that is only needed for the error path in
> linear_map_prealloc_split_pgtables(), then perhaps that part should be
> inlined to deal with the case where we fail to allocate part way through.
>
> Will
This is not for an initial "over-allocation" case.
It is for the case where another CPU causes a split between figuring
out the required number of tables and actually doing the full split
(though it seems unlikely), which Ryan pointed out here:
- https://lore.kernel.org/all/73ced1db-a2e2-49ea-927e-9fc4a30e771e@arm.com/
So the free logic is also required in the success case.
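For reference, the free helper is roughly shaped like this (a simplified
sketch of what it does, not the exact code in the patch): it releases
whatever the split did not consume.

static void linear_map_free_split_pgtables(void)
{
	unsigned long i;

	if (!split_pgtables)
		return;

	/* Free the preallocated tables the split did not consume. */
	for (i = split_pgtables_idx; i < split_pgtables_count; i++) {
		if (split_pgtables[i])
			pagetable_free(split_pgtables[i]);
	}

	kvfree(split_pgtables);
	split_pgtables = NULL;
	split_pgtables_idx = 0;
	split_pgtables_count = 0;
}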
--
Sincerely,
Yeoreum Yun
> >>>> On Mon, Jan 05, 2026 at 08:23:27PM +0000, Yeoreum Yun wrote:
> >>>>> +static int __init linear_map_prealloc_split_pgtables(void)
> >>>>> +{
> >>>>> + int ret, i;
> >>>>> + unsigned long lstart = _PAGE_OFFSET(vabits_actual);
> >>>>> + unsigned long lend = PAGE_END;
> >>>>> + unsigned long kstart = (unsigned long)lm_alias(_stext);
> >>>>> + unsigned long kend = (unsigned long)lm_alias(__init_begin);
> >>>>> +
> >>>>> + const struct mm_walk_ops collect_to_split_ops = {
> >>>>> + .pud_entry = collect_to_split_pud_entry,
> >>>>> + .pmd_entry = collect_to_split_pmd_entry
> >>>>> + };
> >>>>
> >>>> Why do we need to rewalk the page-table here instead of collating the
> >>>> number of block mappings we put down when creating the linear map in
> >>>> the first place?
> >>
> >> That's a good point; perhaps we can reuse the counters that this series introduces?
> >>
> >> https://lore.kernel.org/all/20260107002944.2940963-1-yang@os.amperecomputing.com/
> >>
> >>>
> >>> First, the linear alias of [_text, __init_begin) is not a target for
> >>> the split, and it also seems strange to me to add code inside alloc_init_XXX()
> >>> that both checks an address range and counts block mappings.
> >>>
> >>> Second, for a future feature,
> >>> I hope to add some code to split a "specific" area, e.g.
> >>> to set a specific pkey for a specific region.
> >>
> >> Could you give more detail on this? My working assumption is that either the
> >> system supports BBML2 or it doesn't. If it doesn't, we need to split the whole
> >> linear map. If it does, we already have logic to split parts of the linear map
> >> when needed.
> >
> > This is not for the linear mapping case, but for the "kernel text area".
> > As a draft, I want to mark some kernel code as executable by
> > both the kernel and eBPF programs.
> > (I'm trying to prevent eBPF programs from directly executing kernel
> > code, using the POE feature.)
> > For such an "executable area" shared by the kernel and eBPF programs
> > -- a typical example is the exception entry -- we need to split that
> > specific range and mark it with a special POE index.
>
> Ahh yes, I recall you mentioning this a while back (although I confess all the
> details have fallen out of my head). You'd need to make sure you're definitely
> not splitting an area of text that the secondary CPUs are executing while they
> are being held in the pen, since at least one of those CPUs doesn't support BBML2.
>
Absolutely. Anyway, for that feature I hope to keep the current
approach -- collect, pre-allocate, and then use the tables for specific ranges.
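For reference, the collection step just counts the leaf entries that will
need new tables. A simplified sketch of the PMD callback (the real patch
may count more, e.g. the extra tables needed under a PUD block):

static int collect_to_split_pmd_entry(pmd_t *pmdp, unsigned long addr,
				      unsigned long next, struct mm_walk *walk)
{
	/* Splitting a PMD block mapping needs one new PTE table. */
	if (pmd_leaf(pmdp_get(pmdp)))
		split_pgtables_count++;

	return 0;
}

The PUD callback is analogous, accounting for the PMD table (and the PTE
tables beneath it) needed to split a PUD block.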
> >
> >>
> >>>
> >>> In this case, it's useful to rewalk the page-table over the specific
> >>> range to get the number of block mappings.
> >>>
> >>>>
> >>>>> + split_pgtables_idx = 0;
> >>>>> + split_pgtables_count = 0;
> >>>>> +
> >>>>> + ret = walk_kernel_page_table_range_lockless(lstart, kstart,
> >>>>> + &collect_to_split_ops,
> >>>>> + NULL, NULL);
> >>>>> + if (!ret)
> >>>>> + ret = walk_kernel_page_table_range_lockless(kend, lend,
> >>>>> + &collect_to_split_ops,
> >>>>> + NULL, NULL);
> >>>>> + if (ret || !split_pgtables_count)
> >>>>> + goto error;
> >>>>> +
> >>>>> + ret = -ENOMEM;
> >>>>> +
> >>>>> + split_pgtables = kvmalloc(split_pgtables_count * sizeof(struct ptdesc *),
> >>>>> + GFP_KERNEL | __GFP_ZERO);
> >>>>> + if (!split_pgtables)
> >>>>> + goto error;
> >>>>> +
> >>>>> + for (i = 0; i < split_pgtables_count; i++) {
> >>>>> + /* The page table will be filled during splitting, so zeroing it is unnecessary. */
> >>>>> + split_pgtables[i] = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
> >>>>> + if (!split_pgtables[i])
> >>>>> + goto error;
> >>>>
> >>>> This looks potentially expensive on the boot path and only gets worse as
> >>>> the amount of memory grows. Maybe we should predicate this preallocation
> >>>> on preempt-rt?
> >>>
> >>> Agreed. Then I'll apply pre-allocation only for PREEMPT_RT.
> >>
> >> I guess I'm missing something obvious but I don't understand the problem here...
> >> We are only deferring the allocation of all these pgtables, so the cost is
> >> neutral surely? Had we correctly guessed that the system doesn't support BBML2
> >> earlier, we would have had to allocate all these pgtables earlier.
> >>
> >> Another way to look at it is that we are still allocating the same number of
> >> pgtables in the existing fallback path, it's just that we are doing it inside
> >> the stop_machine().
> >>
> >> My vote would be _not_ to have a separate path for PREEMPT_RT, which will end up
> >> with significantly less testing...
> >
> > IIUC, Will is referring to the additional memory allocation for
> > "split_pgtables", the array that holds the pre-allocated page tables.
> > As memory size grows, this array definitely grows with it.
>
> Err, so you're referring to the extra kvmalloc()? I don't think that's a big
> deal, is it? You get 512 pointers per page, so the amortized cost is 1/512 ~= 0.2%?
>
> I suspect we have both misunderstood Will's point...
Might be... sorry for my misunderstanding.
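For what it's worth, the pointer array overhead really is tiny: assuming
4K pages and 8-byte pointers, each preallocated PTE table is one 4K page
and its slot in split_pgtables is 8 bytes, i.e. 8 / 4096 = 1/512 ~= 0.2%
on top of the page-table pages themselves.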
--
Sincerely,
Yeoreum Yun