xen/arch/arm/arm32/mmu/mm.c | 3 +- xen/arch/arm/include/asm/mm.h | 4 +- xen/arch/arm/mm.c | 2 +- xen/arch/arm/mmu/mm.c | 77 ++++++++++++++++++++++++----------- xen/arch/arm/mpu/mm.c | 23 ++++++----- 5 files changed, 70 insertions(+), 39 deletions(-)
Refactor setup_frametable_mappings() into init_frametable(), modeled
after x86's implementation. Instead of mapping one contiguous frametable
covering ram_start to ram_end (including holes), iterate the
pdx_group_valid bitmap to allocate and map frametable memory only for
valid PDX groups, skipping gaps in the physical address space. At the
moment we don't really take into account pdx_group_valid bitmap.
This reduces memory consumption on systems with sparse RAM layouts by
not allocating frametable entries for non-existent memory regions.
A file-local pdx_to_page() override is needed because the generic macro
in xen/include/xen/pdx.h does not account for ARM's non-zero
frametable_base_pdx.
Update the MPU implementation to match the new init_frametable()
signature. Since MPU has no virtual address translation (ma == va),
hole-skipping is not possible and the frametable remains a single
contiguous allocation.
Signed-off-by: Michal Orzel <michal.orzel@amd.com>
---
We've been using this approach at AMD for a while now. Without this we would not
be able to boot some of our boards that have huge holes in the PA space, so I
consider this patch a great improvement.
Two things to consider as a follow-up in the future:
- change generic pdx_to_page, page_to_pdx to take into account offset that
on x86 is zero but on other arches it is not. The page list code is
for now unaffected because the offset cancels out,
- use the same on RISCV.
---
xen/arch/arm/arm32/mmu/mm.c | 3 +-
xen/arch/arm/include/asm/mm.h | 4 +-
xen/arch/arm/mm.c | 2 +-
xen/arch/arm/mmu/mm.c | 77 ++++++++++++++++++++++++-----------
xen/arch/arm/mpu/mm.c | 23 ++++++-----
5 files changed, 70 insertions(+), 39 deletions(-)
diff --git a/xen/arch/arm/arm32/mmu/mm.c b/xen/arch/arm/arm32/mmu/mm.c
index 5e4766ddcf65..0b595baa11b3 100644
--- a/xen/arch/arm/arm32/mmu/mm.c
+++ b/xen/arch/arm/arm32/mmu/mm.c
@@ -178,8 +178,7 @@ void __init setup_mm(void)
setup_directmap_mappings(mfn_x(directmap_mfn_start), xenheap_pages);
- /* Frame table covers all of RAM region, including holes */
- setup_frametable_mappings(ram_start, ram_end);
+ init_frametable(ram_start);
/*
* The allocators may need to use map_domain_page() (such as for
diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
index 72a692862420..2eb8465aa904 100644
--- a/xen/arch/arm/include/asm/mm.h
+++ b/xen/arch/arm/include/asm/mm.h
@@ -196,8 +196,8 @@ extern void *early_fdt_map(paddr_t fdt_paddr);
extern void remove_early_mappings(void);
/* Prepare the memory subystem to bring-up the given secondary CPU */
extern int prepare_secondary_mm(int cpu);
-/* Map a frame table to cover physical addresses ps through pe */
-extern void setup_frametable_mappings(paddr_t ps, paddr_t pe);
+/* Map a frame table */
+void init_frametable(paddr_t ram_start);
/* Helper function to setup memory management */
void setup_mm_helper(void);
/* map a physical range in virtual memory */
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index 6df8b616e464..e6b651956927 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -62,7 +62,7 @@ void __init setup_mm(void)
setup_mm_helper();
- setup_frametable_mappings(ram_start, ram_end);
+ init_frametable(ram_start);
init_staticmem_pages();
init_sharedmem_pages();
diff --git a/xen/arch/arm/mmu/mm.c b/xen/arch/arm/mmu/mm.c
index 6604f3bf4e6a..4b4da349c16c 100644
--- a/xen/arch/arm/mmu/mm.c
+++ b/xen/arch/arm/mmu/mm.c
@@ -8,16 +8,37 @@
#include <xen/pdx.h>
#include <xen/string.h>
-/* Map a frame table to cover physical addresses ps through pe */
-void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
+#undef pdx_to_page
+#define pdx_to_page(pdx) gcc11_wrap(frame_table + ((pdx) - frametable_base_pdx))
+
+static void __init
+init_frametable_chunk(unsigned long pdx_s, unsigned long pdx_e)
{
- unsigned long nr_pdxs = mfn_to_pdx(mfn_add(maddr_to_mfn(pe), -1)) -
- mfn_to_pdx(maddr_to_mfn(ps)) + 1;
- unsigned long frametable_size = nr_pdxs * sizeof(struct page_info);
- mfn_t base_mfn;
- const unsigned long mapping_size = frametable_size < MB(32) ? MB(2)
- : MB(32);
+ unsigned long nr_pdxs = pdx_e - pdx_s;
+ unsigned long chunk_size = nr_pdxs * sizeof(struct page_info);
+ const unsigned long mapping_size = chunk_size < MB(32) ? MB(2) : MB(32);
+ unsigned long virt;
int rc;
+ mfn_t base_mfn;
+
+ /* Round up to 2M or 32M boundary, as appropriate. */
+ chunk_size = ROUNDUP(chunk_size, mapping_size);
+ base_mfn = alloc_boot_pages(chunk_size >> PAGE_SHIFT, 32 << (20 - 12));
+
+ virt = (unsigned long)pdx_to_page(pdx_s);
+ rc = map_pages_to_xen(virt, base_mfn, chunk_size >> PAGE_SHIFT,
+ PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
+ if ( rc )
+ panic("Unable to setup the frametable mappings\n");
+
+ memset(pdx_to_page(pdx_s), 0, nr_pdxs * sizeof(struct page_info));
+ memset(pdx_to_page(pdx_e), -1,
+ chunk_size - nr_pdxs * sizeof(struct page_info));
+}
+
+void __init init_frametable(paddr_t ram_start)
+{
+ unsigned int sidx, nidx, max_idx;
/*
* The size of paddr_t should be sufficient for the complete range of
@@ -26,24 +47,34 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
- if ( frametable_size > FRAMETABLE_SIZE )
- panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
- ps, pe);
+ max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT);
+ frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
- frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ps));
- /* Round up to 2M or 32M boundary, as appropriate. */
- frametable_size = ROUNDUP(frametable_size, mapping_size);
- base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12));
+ /*
+ * pdx_to_page(pdx_s) in init_frametable_chunk must be page-aligned
+ * for map_pages_to_xen(). Aligning to PDX_GROUP_COUNT guarantees this
+ * because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
+ * PAGE_SIZE by construction.
+ */
+ frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
- rc = map_pages_to_xen(FRAMETABLE_VIRT_START, base_mfn,
- frametable_size >> PAGE_SHIFT,
- PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
- if ( rc )
- panic("Unable to setup the frametable mappings.\n");
+ if ( (max_pdx - frametable_base_pdx) > FRAMETABLE_NR )
+ panic("Frametable too small\n");
+
+ for ( sidx = (frametable_base_pdx / PDX_GROUP_COUNT); ; sidx = nidx )
+ {
+ unsigned int eidx;
+
+ eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
+ nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
+
+ if ( nidx >= max_idx )
+ break;
+
+ init_frametable_chunk(sidx * PDX_GROUP_COUNT, eidx * PDX_GROUP_COUNT);
+ }
- memset(&frame_table[0], 0, nr_pdxs * sizeof(struct page_info));
- memset(&frame_table[nr_pdxs], -1,
- frametable_size - (nr_pdxs * sizeof(struct page_info)));
+ init_frametable_chunk(sidx * PDX_GROUP_COUNT, max_pdx);
}
/*
diff --git a/xen/arch/arm/mpu/mm.c b/xen/arch/arm/mpu/mm.c
index aff88bd3a9c1..9c568831c128 100644
--- a/xen/arch/arm/mpu/mm.c
+++ b/xen/arch/arm/mpu/mm.c
@@ -186,16 +186,15 @@ static int is_mm_attr_match(pr_t *region, unsigned int attributes)
return 0;
}
-/* Map a frame table to cover physical addresses ps through pe */
-void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
+/*
+ * Allocate a contiguous frame table covering ram_start through max_pdx.
+ * Unlike the MMU version, MPU cannot skip holes because there is no virtual
+ * address translation (ma == va).
+ */
+void __init init_frametable(paddr_t ram_start)
{
+ unsigned long nr_pdxs, frametable_size;
mfn_t base_mfn;
- paddr_t aligned_ps = ROUNDUP(ps, PAGE_SIZE);
- paddr_t aligned_pe = ROUNDDOWN(pe, PAGE_SIZE);
-
- unsigned long nr_pdxs = mfn_to_pdx(mfn_add(maddr_to_mfn(aligned_pe), -1)) -
- mfn_to_pdx(maddr_to_mfn(aligned_ps)) + 1;
- unsigned long frametable_size = nr_pdxs * sizeof(struct page_info);
/*
* The size of paddr_t should be sufficient for the complete range of
@@ -204,11 +203,13 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
+ frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
+ nr_pdxs = max_pdx - frametable_base_pdx;
+ frametable_size = nr_pdxs * sizeof(struct page_info);
+
if ( frametable_size > FRAMETABLE_SIZE )
- panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
- ps, pe);
+ panic("Frametable too small\n");
- frametable_base_pdx = paddr_to_pdx(aligned_ps);
frametable_size = ROUNDUP(frametable_size, PAGE_SIZE);
base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 1);
--
2.43.0
Hi Michal,
On 17/04/2026 10:11, Michal Orzel wrote:
> Refactor setup_frametable_mappings() into init_frametable(), modeled
> after x86's implementation. Instead of mapping one contiguous frametable
> covering ram_start to ram_end (including holes), iterate the
> pdx_group_valid bitmap to allocate and map frametable memory only for
> valid PDX groups, skipping gaps in the physical address space. At the
> moment we don't really take into account pdx_group_valid bitmap.
>
> This reduces memory consumption on systems with sparse RAM layouts by
> not allocating frametable entries for non-existent memory regions.
>
> A file-local pdx_to_page() override is needed because the generic macro
> in xen/include/xen/pdx.h does not account for ARM's non-zero
> frametable_base_pdx.
Can you provide a bit more details? I am a bit concerned that this could
result in a subtle bug in the future if code within mm.c is expecting the
original behavior. It would be preferable if the change is either for
everyone on Arm or the function is renamed to avoid any clash.
[...]
> +void __init init_frametable(paddr_t ram_start)
> +{
> + unsigned int sidx, nidx, max_idx;
>
> /*
> * The size of paddr_t should be sufficient for the complete range of
> @@ -26,24 +47,34 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>
> - if ( frametable_size > FRAMETABLE_SIZE )
> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
> - ps, pe);
> + max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT);
> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
>
> - frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ps));
> - /* Round up to 2M or 32M boundary, as appropriate. */
> - frametable_size = ROUNDUP(frametable_size, mapping_size);
> - base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12));
> + /*
> + * pdx_to_page(pdx_s) in init_frametable_chunk must be page-aligned
> + * for map_pages_to_xen(). Aligning to PDX_GROUP_COUNT guarantees this
> + * because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
> + * PAGE_SIZE by construction.
> + */
> + frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
>
> - rc = map_pages_to_xen(FRAMETABLE_VIRT_START, base_mfn,
> - frametable_size >> PAGE_SHIFT,
> - PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
> - if ( rc )
> - panic("Unable to setup the frametable mappings.\n");
> + if ( (max_pdx - frametable_base_pdx) > FRAMETABLE_NR )
> + panic("Frametable too small\n");
> +
> + for ( sidx = (frametable_base_pdx / PDX_GROUP_COUNT); ; sidx = nidx )
> + {
> + unsigned int eidx;
> +
> + eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
> + nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
> +
> + if ( nidx >= max_idx )
> + break;
> +
> + init_frametable_chunk(sidx * PDX_GROUP_COUNT, eidx * PDX_GROUP_COUNT);
The function will do a round-up the mapping to either a 2MiB or 32MiB
aligned size. This means we could potentially cover the previous mapped
region or the next one. I can't seem to find any code to cover this
use-case. What did I miss?
Cheers,
--
Julien Grall
On 22/04/2026 09:01, Julien Grall wrote:
> Hi Michal,
>
> On 17/04/2026 10:11, Michal Orzel wrote:
>> Refactor setup_frametable_mappings() into init_frametable(), modeled
>> after x86's implementation. Instead of mapping one contiguous frametable
>> covering ram_start to ram_end (including holes), iterate the
>> pdx_group_valid bitmap to allocate and map frametable memory only for
>> valid PDX groups, skipping gaps in the physical address space. At the
>> moment we don't really take into account pdx_group_valid bitmap.
>>
>> This reduces memory consumption on systems with sparse RAM layouts by
>> not allocating frametable entries for non-existent memory regions.
>>
>> A file-local pdx_to_page() override is needed because the generic macro
>> in xen/include/xen/pdx.h does not account for ARM's non-zero
>> frametable_base_pdx.
>
> Can you provide a bit more details? I am a bit concerned that this could
> result in a subtle bug in the future if code within mm.c is expecting the
> original behavior. It would be preferable if the change is either for
> everyone on Arm or the function is renamed to avoid any clash.
The generic pdx_to_page macro does not account for offset which is something I
mentioned in the footer and I'm willing to work on in the future. As of today,
this macro is *unused* on Arm. It's only used by x86 in some special big mem
related scenario. Using generic pdx_to_page on Arm would be wrong, so a future
patch doing that would be wrong (the fact that this patch adds a local redefine
does not change anything). Do we need a rename for a local redefine in a file
that is only related to frametable? Maybe a comment and a TODO would be ok?
>
> [...]
>
>> +void __init init_frametable(paddr_t ram_start)
>> +{
>> + unsigned int sidx, nidx, max_idx;
>>
>> /*
>> * The size of paddr_t should be sufficient for the complete range of
>> @@ -26,24 +47,34 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
>> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
>> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>>
>> - if ( frametable_size > FRAMETABLE_SIZE )
>> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
>> - ps, pe);
>> + max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT);
>> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
>>
>> - frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ps));
>> - /* Round up to 2M or 32M boundary, as appropriate. */
>> - frametable_size = ROUNDUP(frametable_size, mapping_size);
>> - base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12));
>> + /*
>> + * pdx_to_page(pdx_s) in init_frametable_chunk must be page-aligned
>> + * for map_pages_to_xen(). Aligning to PDX_GROUP_COUNT guarantees this
>> + * because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
>> + * PAGE_SIZE by construction.
>> + */
>> + frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
>>
>> - rc = map_pages_to_xen(FRAMETABLE_VIRT_START, base_mfn,
>> - frametable_size >> PAGE_SHIFT,
>> - PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
>> - if ( rc )
>> - panic("Unable to setup the frametable mappings.\n");
>> + if ( (max_pdx - frametable_base_pdx) > FRAMETABLE_NR )
>> + panic("Frametable too small\n");
>> +
>> + for ( sidx = (frametable_base_pdx / PDX_GROUP_COUNT); ; sidx = nidx )
>> + {
>> + unsigned int eidx;
>> +
>> + eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
>> + nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
>> +
>> + if ( nidx >= max_idx )
>> + break;
>> +
>> + init_frametable_chunk(sidx * PDX_GROUP_COUNT, eidx * PDX_GROUP_COUNT);
>
> The function will do a round-up the mapping to either a 2MiB or 32MiB
> aligned size. This means we could potentially cover the previous mapped
> region or the next one. I can't seem to find any code to cover this
> use-case. What did I miss?
Hmm, I think I calculated something wrong here. Anyway, how about using 2MB
mapping size all the time? PDX group size is 2MB, in-loop chunks are multiple of
2MB, there is no roundup needed - zero overshoot. The last chunk may have ~2MB
overshoot but it does not matter as there is nothing after it to conflict with.
The downside is more TLB pressure.
Alternatively, we could reduce the mapping size closer to boundaries (x86
choice) but that would require a bit more work.
~Michal
Hi Michal,
On 22/04/2026 09:25, Orzel, Michal wrote:
>
>
> On 22/04/2026 09:01, Julien Grall wrote:
>> Hi Michal,
>>
>> On 17/04/2026 10:11, Michal Orzel wrote:
>>> Refactor setup_frametable_mappings() into init_frametable(), modeled
>>> after x86's implementation. Instead of mapping one contiguous frametable
>>> covering ram_start to ram_end (including holes), iterate the
>>> pdx_group_valid bitmap to allocate and map frametable memory only for
>>> valid PDX groups, skipping gaps in the physical address space. At the
>>> moment we don't really take into account pdx_group_valid bitmap.
>>>
>>> This reduces memory consumption on systems with sparse RAM layouts by
>>> not allocating frametable entries for non-existent memory regions.
>>>
>>> A file-local pdx_to_page() override is needed because the generic macro
>>> in xen/include/xen/pdx.h does not account for ARM's non-zero
>>> frametable_base_pdx.
>>
>> Can you provide a bit more details? I am a bit concerned that this could
>> result in a subtle bug in the future if code within mm.c is expecting the
>> original behavior. It would be preferable if the change is either for
>> everyone on Arm or the function is renamed to avoid any clash.
> The generic pdx_to_page macro does not account for offset which is something I
> mentioned in the footer and I'm willing to work on in the future.
Sorry I missed the comment in the footer. But if the function is broken,
then why can't we implement pdx_to_page() correctly now? I understand
that ...
As of today,
> this macro is *unused* on Arm. It's only used by x86 in some special big mem
> related scenario. Using generic pdx_to_page on Arm would be wrong, so a future
> patch doing that would be wrong (the fact that this patch adds a local redefine
> does not change anything). Do we need a rename for a local redefine in a file
> that is only related to frametable? Maybe a comment and a TODO would be ok?
... this is not meant to be used by Arm today. But given this is used in
the page list, it is definitely not obvious that it is broken.
The alternative is to protect/move pdx_to_page() in x86. But I don't
know much churn this would involve.
>
>>
>> [...]
>>
>>> +void __init init_frametable(paddr_t ram_start)
>>> +{
>>> + unsigned int sidx, nidx, max_idx;
>>>
>>> /*
>>> * The size of paddr_t should be sufficient for the complete range of
>>> @@ -26,24 +47,34 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
>>> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
>>> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>>>
>>> - if ( frametable_size > FRAMETABLE_SIZE )
>>> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
>>> - ps, pe);
>>> + max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT);
>>> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
>>>
>>> - frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ps));
>>> - /* Round up to 2M or 32M boundary, as appropriate. */
>>> - frametable_size = ROUNDUP(frametable_size, mapping_size);
>>> - base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12));
>>> + /*
>>> + * pdx_to_page(pdx_s) in init_frametable_chunk must be page-aligned
>>> + * for map_pages_to_xen(). Aligning to PDX_GROUP_COUNT guarantees this
>>> + * because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
>>> + * PAGE_SIZE by construction.
>>> + */
>>> + frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
>>>
>>> - rc = map_pages_to_xen(FRAMETABLE_VIRT_START, base_mfn,
>>> - frametable_size >> PAGE_SHIFT,
>>> - PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
>>> - if ( rc )
>>> - panic("Unable to setup the frametable mappings.\n");
>>> + if ( (max_pdx - frametable_base_pdx) > FRAMETABLE_NR )
>>> + panic("Frametable too small\n");
>>> +
>>> + for ( sidx = (frametable_base_pdx / PDX_GROUP_COUNT); ; sidx = nidx )
>>> + {
>>> + unsigned int eidx;
>>> +
>>> + eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
>>> + nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
>>> +
>>> + if ( nidx >= max_idx )
>>> + break;
>>> +
>>> + init_frametable_chunk(sidx * PDX_GROUP_COUNT, eidx * PDX_GROUP_COUNT);
>>
>> The function will do a round-up the mapping to either a 2MiB or 32MiB
>> aligned size. This means we could potentially cover the previous mapped
>> region or the next one. I can't seem to find any code to cover this
>> use-case. What did I miss?
> Hmm, I think I calculated something wrong here. Anyway, how about using 2MB
> mapping size all the time? PDX group size is 2MB,
Looking at the code, it seems to be based on SECOND_SHIFT which
technically depends on the page granularity. Even though Xen supports
only 4KiB, we are trying to avoid making such assumption or add least
adding a BUILD_BUG_ON() (in this case, I would consider that
PDX_GROUP_COUNT is always 2MiB or SECOND_SHIFT).
> in-loop chunks are multiple of
> 2MB, there is no roundup needed - zero overshoot. The last chunk may have ~2MB
> overshoot but it does not matter as there is nothing after it to conflict with.
> The downside is more TLB pressure.
I am a bit wary of modifying the frametable allocation method because it
is used fairly often in Xen. Would it be possible to enhance the loop to
detect contiguous chunks and decide the size allocation based on the chunk?
>
> Alternatively, we could reduce the mapping size closer to boundaries (x86
> choice) but that would require a bit more work.
This would technically have the same downside as above.
Cheers,
--
Julien Grall
On 29-Apr-26 16:42, Julien Grall wrote:
> Hi Michal,
>
> On 22/04/2026 09:25, Orzel, Michal wrote:
>>
>>
>> On 22/04/2026 09:01, Julien Grall wrote:
>>> Hi Michal,
>>>
>>> On 17/04/2026 10:11, Michal Orzel wrote:
>>>> Refactor setup_frametable_mappings() into init_frametable(), modeled
>>>> after x86's implementation. Instead of mapping one contiguous frametable
>>>> covering ram_start to ram_end (including holes), iterate the
>>>> pdx_group_valid bitmap to allocate and map frametable memory only for
>>>> valid PDX groups, skipping gaps in the physical address space. At the
>>>> moment we don't really take into account pdx_group_valid bitmap.
>>>>
>>>> This reduces memory consumption on systems with sparse RAM layouts by
>>>> not allocating frametable entries for non-existent memory regions.
>>>>
>>>> A file-local pdx_to_page() override is needed because the generic macro
>>>> in xen/include/xen/pdx.h does not account for ARM's non-zero
>>>> frametable_base_pdx.
>>>
>>> Can you provide a bit more details? I am a bit concerned that this could
>>> result in a subtle bug in the future if code within mm.c is expecting the
>>> original behavior. It would be preferable if the change is either for
>>> everyone on Arm or the function is renamed to avoid any clash.
>> The generic pdx_to_page macro does not account for offset which is something I
>> mentioned in the footer and I'm willing to work on in the future.
>
> Sorry I missed the comment in the footer. But if the function is broken,
> then why can't we implement pdx_to_page() correctly now? I understand
> that ...
I wanted to do this in the future but ok, will do in v2.
>
> As of today,
>> this macro is *unused* on Arm. It's only used by x86 in some special big mem
>> related scenario. Using generic pdx_to_page on Arm would be wrong, so a future
>> patch doing that would be wrong (the fact that this patch adds a local redefine
>> does not change anything). Do we need a rename for a local redefine in a file
>> that is only related to frametable? Maybe a comment and a TODO would be ok?
>
> ... this is not meant to be used by Arm today. But given this is used in
> the page list, it is definitely not obvious that it is broken.
>
> The alternative is to protect/move pdx_to_page() in x86. But I don't
> know much churn this would involve.
>
>>
>>>
>>> [...]
>>>
>>>> +void __init init_frametable(paddr_t ram_start)
>>>> +{
>>>> + unsigned int sidx, nidx, max_idx;
>>>>
>>>> /*
>>>> * The size of paddr_t should be sufficient for the complete range of
>>>> @@ -26,24 +47,34 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
>>>> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
>>>> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>>>>
>>>> - if ( frametable_size > FRAMETABLE_SIZE )
>>>> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
>>>> - ps, pe);
>>>> + max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT);
>>>> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
>>>>
>>>> - frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ps));
>>>> - /* Round up to 2M or 32M boundary, as appropriate. */
>>>> - frametable_size = ROUNDUP(frametable_size, mapping_size);
>>>> - base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12));
>>>> + /*
>>>> + * pdx_to_page(pdx_s) in init_frametable_chunk must be page-aligned
>>>> + * for map_pages_to_xen(). Aligning to PDX_GROUP_COUNT guarantees this
>>>> + * because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
>>>> + * PAGE_SIZE by construction.
>>>> + */
>>>> + frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
>>>>
>>>> - rc = map_pages_to_xen(FRAMETABLE_VIRT_START, base_mfn,
>>>> - frametable_size >> PAGE_SHIFT,
>>>> - PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
>>>> - if ( rc )
>>>> - panic("Unable to setup the frametable mappings.\n");
>>>> + if ( (max_pdx - frametable_base_pdx) > FRAMETABLE_NR )
>>>> + panic("Frametable too small\n");
>>>> +
>>>> + for ( sidx = (frametable_base_pdx / PDX_GROUP_COUNT); ; sidx = nidx )
>>>> + {
>>>> + unsigned int eidx;
>>>> +
>>>> + eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
>>>> + nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
>>>> +
>>>> + if ( nidx >= max_idx )
>>>> + break;
>>>> +
>>>> + init_frametable_chunk(sidx * PDX_GROUP_COUNT, eidx * PDX_GROUP_COUNT);
>>>
>>> The function will do a round-up the mapping to either a 2MiB or 32MiB
>>> aligned size. This means we could potentially cover the previous mapped
>>> region or the next one. I can't seem to find any code to cover this
>>> use-case. What did I miss?
>> Hmm, I think I calculated something wrong here. Anyway, how about using 2MB
>> mapping size all the time? PDX group size is 2MB,
>
> Looking at the code, it seems to be based on SECOND_SHIFT which
> technically depends on the page granularity. Even though Xen supports
> only 4KiB, we are trying to avoid making such an assumption or at least
> adding a BUILD_BUG_ON() (in this case, I would consider that
> PDX_GROUP_COUNT is always 2MiB or SECOND_SHIFT).
>
>> in-loop chunks are multiple of
>> 2MB, there is no roundup needed - zero overshoot. The last chunk may have ~2MB
>> overshoot but it does not matter as there is nothing after it to conflict with.
>> The downside is more TLB pressure.
>
> I am a bit wary of modifying the frametable allocation method because it
> is used fairly often in Xen. Would it be possible to enhance the loop to
> detect contiguous chunks and decide the size allocation based on the chunk?
To eliminate overshoot I'll round up to page size. In-loop chunks are already
page aligned so the round up is a no op except for the last chunk.
What we care about is to reduce TLB pressure by setting wherever possible
contiguous bit. It's not lost because map_pages_to_xen with block flag handles
contiguous internally via xen_pt_check_contig. Will do in v2.
~Michal
Hello Michał, On 4/17/26 11:11 AM, Michal Orzel wrote: > Refactor setup_frametable_mappings() into init_frametable(), modeled > after x86's implementation. Instead of mapping one contiguous frametable > covering ram_start to ram_end (including holes), iterate the > pdx_group_valid bitmap to allocate and map frametable memory only for > valid PDX groups, skipping gaps in the physical address space. At the > moment we don't really take into account pdx_group_valid bitmap. > > This reduces memory consumption on systems with sparse RAM layouts by > not allocating frametable entries for non-existent memory regions. > > A file-local pdx_to_page() override is needed because the generic macro > in xen/include/xen/pdx.h does not account for ARM's non-zero > frametable_base_pdx. > > Update the MPU implementation to match the new init_frametable() > signature. Since MPU has no virtual address translation (ma == va), > hole-skipping is not possible and the frametable remains a single > contiguous allocation. > > Signed-off-by: Michal Orzel<michal.orzel@amd.com> > --- > We've been using this approach at AMD for a while now. Without this we would not > be able to boot some of our boards that have huge holes in the PA space, so I > consider this patch a great improvement. > > Two things to consider as a follow-up in the future: > - change generic pdx_to_page, page_to_pdx to take into account offset that > on x86 is zero but on other arches it is not. The page list code is > for now unaffected because the offset cancels out, > - use the same on RISCV. Do you have such plans to do that for RISC-V? ~ Oleksii
On 21/04/2026 15:30, Oleksii Kurochko wrote: > Hello Michał, > > On 4/17/26 11:11 AM, Michal Orzel wrote: >> Refactor setup_frametable_mappings() into init_frametable(), modeled >> after x86's implementation. Instead of mapping one contiguous frametable >> covering ram_start to ram_end (including holes), iterate the >> pdx_group_valid bitmap to allocate and map frametable memory only for >> valid PDX groups, skipping gaps in the physical address space. At the >> moment we don't really take into account pdx_group_valid bitmap. >> >> This reduces memory consumption on systems with sparse RAM layouts by >> not allocating frametable entries for non-existent memory regions. >> >> A file-local pdx_to_page() override is needed because the generic macro >> in xen/include/xen/pdx.h does not account for ARM's non-zero >> frametable_base_pdx. >> >> Update the MPU implementation to match the new init_frametable() >> signature. Since MPU has no virtual address translation (ma == va), >> hole-skipping is not possible and the frametable remains a single >> contiguous allocation. >> >> Signed-off-by: Michal Orzel<michal.orzel@amd.com> >> --- >> We've been using this approach at AMD for a while now. Without this we would not >> be able to boot some of our boards that have huge holes in the PA space, so I >> consider this patch a great improvement. >> >> Two things to consider as a follow-up in the future: >> - change generic pdx_to_page, page_to_pdx to take into account offset that >> on x86 is zero but on other arches it is not. The page list code is >> for now unaffected because the offset cancels out, >> - use the same on RISCV. > > Do you have such plans to do that for RISC-V? My plan for this release cycle is to do this for Arm as we can observe great improvement in space management. Other things that I mentioned I planned for the future. I don't want to add new things on our plate in this release. ~Michal
On 17/04/2026 10:11 am, Michal Orzel wrote:
> Refactor setup_frametable_mappings() into init_frametable(), modeled
> after x86's implementation. Instead of mapping one contiguous frametable
> covering ram_start to ram_end (including holes), iterate the
> pdx_group_valid bitmap to allocate and map frametable memory only for
> valid PDX groups, skipping gaps in the physical address space. At the
> moment we don't really take into account pdx_group_valid bitmap.
>
> This reduces memory consumption on systems with sparse RAM layouts by
> not allocating frametable entries for non-existent memory regions.
>
> A file-local pdx_to_page() override is needed because the generic macro
> in xen/include/xen/pdx.h does not account for ARM's non-zero
> frametable_base_pdx.
>
> Update the MPU implementation to match the new init_frametable()
> signature. Since MPU has no virtual address translation (ma == va),
> hole-skipping is not possible and the frametable remains a single
> contiguous allocation.
>
> Signed-off-by: Michal Orzel <michal.orzel@amd.com>
> ---
> We've been using this approach at AMD for a while now. Without this we would not
> be able to boot some of our boards that have huge holes in the PA space, so I
> consider this patch a great improvement.
>
> Two things to consider as a follow-up in the future:
> - change generic pdx_to_page, page_to_pdx to take into account offset that
> on x86 is zero but on other arches it is not. The page list code is
> for now unaffected because the offset cancels out,
> - use the same on RISCV.
PPC also has frametable_base_pdx and a TODO saying "make this work".
The header file however is horribly tangled and needs some work.
What I think we want is to include <asm/pdx.h> early (if it exists), and
allow it the option to define frametable_base_pdx, but with a common
fallback making it 0.
I think we also want to split out xen/pdx-{mask,offset}.h each taking
their respective chunk of the giant comment. Possibly pdx-none.h too.
This separates the unrelated algorithms, and puts their definitions next
to the comment explaining how it works.
~Andrew
Hi Michal,
> On 17 Apr 2026, at 10:11, Michal Orzel <michal.orzel@amd.com> wrote:
>
> Refactor setup_frametable_mappings() into init_frametable(), modeled
> after x86's implementation. Instead of mapping one contiguous frametable
> covering ram_start to ram_end (including holes), iterate the
> pdx_group_valid bitmap to allocate and map frametable memory only for
> valid PDX groups, skipping gaps in the physical address space. At the
> moment we don't really take into account pdx_group_valid bitmap.
>
> This reduces memory consumption on systems with sparse RAM layouts by
> not allocating frametable entries for non-existent memory regions.
>
> A file-local pdx_to_page() override is needed because the generic macro
> in xen/include/xen/pdx.h does not account for ARM's non-zero
> frametable_base_pdx.
>
> Update the MPU implementation to match the new init_frametable()
> signature. Since MPU has no virtual address translation (ma == va),
> hole-skipping is not possible and the frametable remains a single
> contiguous allocation.
>
> Signed-off-by: Michal Orzel <michal.orzel@amd.com>
> ---
> We've been using this approach at AMD for a while now. Without this we would not
> be able to boot some of our boards that have huge holes in the PA space, so I
> consider this patch a great improvement.
>
> Two things to consider as a follow-up in the future:
> - change generic pdx_to_page, page_to_pdx to take into account offset that
> on x86 is zero but on other arches it is not. The page list code is
> for now unaffected because the offset cancels out,
> - use the same on RISCV.
> ---
> xen/arch/arm/arm32/mmu/mm.c | 3 +-
> xen/arch/arm/include/asm/mm.h | 4 +-
> xen/arch/arm/mm.c | 2 +-
> xen/arch/arm/mmu/mm.c | 77 ++++++++++++++++++++++++-----------
> xen/arch/arm/mpu/mm.c | 23 ++++++-----
> 5 files changed, 70 insertions(+), 39 deletions(-)
>
> diff --git a/xen/arch/arm/arm32/mmu/mm.c b/xen/arch/arm/arm32/mmu/mm.c
> index 5e4766ddcf65..0b595baa11b3 100644
> --- a/xen/arch/arm/arm32/mmu/mm.c
> +++ b/xen/arch/arm/arm32/mmu/mm.c
> @@ -178,8 +178,7 @@ void __init setup_mm(void)
>
> setup_directmap_mappings(mfn_x(directmap_mfn_start), xenheap_pages);
>
> - /* Frame table covers all of RAM region, including holes */
> - setup_frametable_mappings(ram_start, ram_end);
> + init_frametable(ram_start);
>
> /*
> * The allocators may need to use map_domain_page() (such as for
> diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
> index 72a692862420..2eb8465aa904 100644
> --- a/xen/arch/arm/include/asm/mm.h
> +++ b/xen/arch/arm/include/asm/mm.h
> @@ -196,8 +196,8 @@ extern void *early_fdt_map(paddr_t fdt_paddr);
> extern void remove_early_mappings(void);
> /* Prepare the memory subystem to bring-up the given secondary CPU */
> extern int prepare_secondary_mm(int cpu);
> -/* Map a frame table to cover physical addresses ps through pe */
> -extern void setup_frametable_mappings(paddr_t ps, paddr_t pe);
> +/* Map a frame table */
NIT: Would /* Initialize the frame table */ fit better the new helper description?
> +void init_frametable(paddr_t ram_start);
> /* Helper function to setup memory management */
> void setup_mm_helper(void);
> /* map a physical range in virtual memory */
> diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
> index 6df8b616e464..e6b651956927 100644
> --- a/xen/arch/arm/mm.c
> +++ b/xen/arch/arm/mm.c
> @@ -62,7 +62,7 @@ void __init setup_mm(void)
>
> setup_mm_helper();
>
> - setup_frametable_mappings(ram_start, ram_end);
> + init_frametable(ram_start);
>
> init_staticmem_pages();
> init_sharedmem_pages();
> diff --git a/xen/arch/arm/mmu/mm.c b/xen/arch/arm/mmu/mm.c
> index 6604f3bf4e6a..4b4da349c16c 100644
> --- a/xen/arch/arm/mmu/mm.c
> +++ b/xen/arch/arm/mmu/mm.c
> @@ -8,16 +8,37 @@
> #include <xen/pdx.h>
> #include <xen/string.h>
>
> -/* Map a frame table to cover physical addresses ps through pe */
> -void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
> +#undef pdx_to_page
> +#define pdx_to_page(pdx) gcc11_wrap(frame_table + ((pdx) - frametable_base_pdx))
> +
> +static void __init
> +init_frametable_chunk(unsigned long pdx_s, unsigned long pdx_e)
> {
> - unsigned long nr_pdxs = mfn_to_pdx(mfn_add(maddr_to_mfn(pe), -1)) -
> - mfn_to_pdx(maddr_to_mfn(ps)) + 1;
> - unsigned long frametable_size = nr_pdxs * sizeof(struct page_info);
> - mfn_t base_mfn;
> - const unsigned long mapping_size = frametable_size < MB(32) ? MB(2)
> - : MB(32);
> + unsigned long nr_pdxs = pdx_e - pdx_s;
> + unsigned long chunk_size = nr_pdxs * sizeof(struct page_info);
> + const unsigned long mapping_size = chunk_size < MB(32) ? MB(2) : MB(32);
> + unsigned long virt;
> int rc;
> + mfn_t base_mfn;
> +
> + /* Round up to 2M or 32M boundary, as appropriate. */
> + chunk_size = ROUNDUP(chunk_size, mapping_size);
> + base_mfn = alloc_boot_pages(chunk_size >> PAGE_SHIFT, 32 << (20 - 12));
> +
> + virt = (unsigned long)pdx_to_page(pdx_s);
> + rc = map_pages_to_xen(virt, base_mfn, chunk_size >> PAGE_SHIFT,
> + PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
> + if ( rc )
> + panic("Unable to setup the frametable mappings\n");
> +
> + memset(pdx_to_page(pdx_s), 0, nr_pdxs * sizeof(struct page_info));
> + memset(pdx_to_page(pdx_e), -1,
> + chunk_size - nr_pdxs * sizeof(struct page_info));
> +}
> +
> +void __init init_frametable(paddr_t ram_start)
> +{
> + unsigned int sidx, nidx, max_idx;
>
> /*
> * The size of paddr_t should be sufficient for the complete range of
> @@ -26,24 +47,34 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>
> - if ( frametable_size > FRAMETABLE_SIZE )
> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
> - ps, pe);
> + max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT);
> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
>
> - frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ps));
> - /* Round up to 2M or 32M boundary, as appropriate. */
> - frametable_size = ROUNDUP(frametable_size, mapping_size);
> - base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12));
> + /*
> + * pdx_to_page(pdx_s) in init_frametable_chunk must be page-aligned
> + * for map_pages_to_xen(). Aligning to PDX_GROUP_COUNT guarantees this
> + * because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
> + * PAGE_SIZE by construction.
> + */
> + frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
We are now rounding down frametable_base_pdx which before this patch it was the start of the ram,
but in xen/xen/arch/arm/include/asm/mm.h, mfn_valid(mfn) is using frametable_base_pdx to check for
mfn validity, this means that we could pass an mfn before the start of the ram and if __mfn_valid is happy,
we are getting a regression.
Can this happen or am I missing something?
>
> - rc = map_pages_to_xen(FRAMETABLE_VIRT_START, base_mfn,
> - frametable_size >> PAGE_SHIFT,
> - PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
> - if ( rc )
> - panic("Unable to setup the frametable mappings.\n");
> + if ( (max_pdx - frametable_base_pdx) > FRAMETABLE_NR )
> + panic("Frametable too small\n");
> +
> + for ( sidx = (frametable_base_pdx / PDX_GROUP_COUNT); ; sidx = nidx )
> + {
> + unsigned int eidx;
> +
> + eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
> + nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
> +
> + if ( nidx >= max_idx )
> + break;
> +
> + init_frametable_chunk(sidx * PDX_GROUP_COUNT, eidx * PDX_GROUP_COUNT);
> + }
>
> - memset(&frame_table[0], 0, nr_pdxs * sizeof(struct page_info));
> - memset(&frame_table[nr_pdxs], -1,
> - frametable_size - (nr_pdxs * sizeof(struct page_info)));
> + init_frametable_chunk(sidx * PDX_GROUP_COUNT, max_pdx);
> }
>
> /*
> diff --git a/xen/arch/arm/mpu/mm.c b/xen/arch/arm/mpu/mm.c
> index aff88bd3a9c1..9c568831c128 100644
> --- a/xen/arch/arm/mpu/mm.c
> +++ b/xen/arch/arm/mpu/mm.c
> @@ -186,16 +186,15 @@ static int is_mm_attr_match(pr_t *region, unsigned int attributes)
> return 0;
> }
>
> -/* Map a frame table to cover physical addresses ps through pe */
> -void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
> +/*
> + * Allocate a contiguous frame table covering ram_start through max_pdx.
> + * Unlike the MMU version, MPU cannot skip holes because there is no virtual
> + * address translation (ma == va).
> + */
> +void __init init_frametable(paddr_t ram_start)
> {
> + unsigned long nr_pdxs, frametable_size;
> mfn_t base_mfn;
> - paddr_t aligned_ps = ROUNDUP(ps, PAGE_SIZE);
> - paddr_t aligned_pe = ROUNDDOWN(pe, PAGE_SIZE);
> -
> - unsigned long nr_pdxs = mfn_to_pdx(mfn_add(maddr_to_mfn(aligned_pe), -1)) -
> - mfn_to_pdx(maddr_to_mfn(aligned_ps)) + 1;
> - unsigned long frametable_size = nr_pdxs * sizeof(struct page_info);
>
> /*
> * The size of paddr_t should be sufficient for the complete range of
> @@ -204,11 +203,13 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>
> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
> + nr_pdxs = max_pdx - frametable_base_pdx;
> + frametable_size = nr_pdxs * sizeof(struct page_info);
> +
> if ( frametable_size > FRAMETABLE_SIZE )
> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
> - ps, pe);
> + panic("Frametable too small\n");
>
> - frametable_base_pdx = paddr_to_pdx(aligned_ps);
> frametable_size = ROUNDUP(frametable_size, PAGE_SIZE);
>
> base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 1);
> --
> 2.43.0
>
On 17/04/2026 13:55, Luca Fancellu wrote:
> HI Michal,
>
>> On 17 Apr 2026, at 10:11, Michal Orzel <michal.orzel@amd.com> wrote:
>>
>> Refactor setup_frametable_mappings() into init_frametable(), modeled
>> after x86's implementation. Instead of mapping one contiguous frametable
>> covering ram_start to ram_end (including holes), iterate the
>> pdx_group_valid bitmap to allocate and map frametable memory only for
>> valid PDX groups, skipping gaps in the physical address space. At the
>> moment we don't really take into account pdx_group_valid bitmap.
>>
>> This reduces memory consumption on systems with sparse RAM layouts by
>> not allocating frametable entries for non-existent memory regions.
>>
>> A file-local pdx_to_page() override is needed because the generic macro
>> in xen/include/xen/pdx.h does not account for ARM's non-zero
>> frametable_base_pdx.
>>
>> Update the MPU implementation to match the new init_frametable()
>> signature. Since MPU has no virtual address translation (ma == va),
>> hole-skipping is not possible and the frametable remains a single
>> contiguous allocation.
>>
>> Signed-off-by: Michal Orzel <michal.orzel@amd.com>
>> ---
>> We've been using this approach at AMD for a while now. Without this we would not
>> be able to boot some of our boards that have huge holes in the PA space, so I
>> consider this patch a great improvement.
>>
>> Two things to consider as a follow-up in the future:
>> - change generic pdx_to_page, page_to_pdx to take into account offset that
>> on x86 is zero but on other arches it is not. The page list code is
>> for now unaffected because the offset cancels out,
>> - use the same on RISCV.
>> ---
>> xen/arch/arm/arm32/mmu/mm.c | 3 +-
>> xen/arch/arm/include/asm/mm.h | 4 +-
>> xen/arch/arm/mm.c | 2 +-
>> xen/arch/arm/mmu/mm.c | 77 ++++++++++++++++++++++++-----------
>> xen/arch/arm/mpu/mm.c | 23 ++++++-----
>> 5 files changed, 70 insertions(+), 39 deletions(-)
>>
>> diff --git a/xen/arch/arm/arm32/mmu/mm.c b/xen/arch/arm/arm32/mmu/mm.c
>> index 5e4766ddcf65..0b595baa11b3 100644
>> --- a/xen/arch/arm/arm32/mmu/mm.c
>> +++ b/xen/arch/arm/arm32/mmu/mm.c
>> @@ -178,8 +178,7 @@ void __init setup_mm(void)
>>
>> setup_directmap_mappings(mfn_x(directmap_mfn_start), xenheap_pages);
>>
>> - /* Frame table covers all of RAM region, including holes */
>> - setup_frametable_mappings(ram_start, ram_end);
>> + init_frametable(ram_start);
>>
>> /*
>> * The allocators may need to use map_domain_page() (such as for
>> diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
>> index 72a692862420..2eb8465aa904 100644
>> --- a/xen/arch/arm/include/asm/mm.h
>> +++ b/xen/arch/arm/include/asm/mm.h
>> @@ -196,8 +196,8 @@ extern void *early_fdt_map(paddr_t fdt_paddr);
>> extern void remove_early_mappings(void);
>> /* Prepare the memory subystem to bring-up the given secondary CPU */
>> extern int prepare_secondary_mm(int cpu);
>> -/* Map a frame table to cover physical addresses ps through pe */
>> -extern void setup_frametable_mappings(paddr_t ps, paddr_t pe);
>> +/* Map a frame table */
>
> NIT: Would /* Initialize the frame table */ fit better the new helper description?
>
>> +void init_frametable(paddr_t ram_start);
>> /* Helper function to setup memory management */
>> void setup_mm_helper(void);
>> /* map a physical range in virtual memory */
>> diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
>> index 6df8b616e464..e6b651956927 100644
>> --- a/xen/arch/arm/mm.c
>> +++ b/xen/arch/arm/mm.c
>> @@ -62,7 +62,7 @@ void __init setup_mm(void)
>>
>> setup_mm_helper();
>>
>> - setup_frametable_mappings(ram_start, ram_end);
>> + init_frametable(ram_start);
>>
>> init_staticmem_pages();
>> init_sharedmem_pages();
>> diff --git a/xen/arch/arm/mmu/mm.c b/xen/arch/arm/mmu/mm.c
>> index 6604f3bf4e6a..4b4da349c16c 100644
>> --- a/xen/arch/arm/mmu/mm.c
>> +++ b/xen/arch/arm/mmu/mm.c
>> @@ -8,16 +8,37 @@
>> #include <xen/pdx.h>
>> #include <xen/string.h>
>>
>> -/* Map a frame table to cover physical addresses ps through pe */
>> -void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
>> +#undef pdx_to_page
>> +#define pdx_to_page(pdx) gcc11_wrap(frame_table + ((pdx) - frametable_base_pdx))
>> +
>> +static void __init
>> +init_frametable_chunk(unsigned long pdx_s, unsigned long pdx_e)
>> {
>> - unsigned long nr_pdxs = mfn_to_pdx(mfn_add(maddr_to_mfn(pe), -1)) -
>> - mfn_to_pdx(maddr_to_mfn(ps)) + 1;
>> - unsigned long frametable_size = nr_pdxs * sizeof(struct page_info);
>> - mfn_t base_mfn;
>> - const unsigned long mapping_size = frametable_size < MB(32) ? MB(2)
>> - : MB(32);
>> + unsigned long nr_pdxs = pdx_e - pdx_s;
>> + unsigned long chunk_size = nr_pdxs * sizeof(struct page_info);
>> + const unsigned long mapping_size = chunk_size < MB(32) ? MB(2) : MB(32);
>> + unsigned long virt;
>> int rc;
>> + mfn_t base_mfn;
>> +
>> + /* Round up to 2M or 32M boundary, as appropriate. */
>> + chunk_size = ROUNDUP(chunk_size, mapping_size);
>> + base_mfn = alloc_boot_pages(chunk_size >> PAGE_SHIFT, 32 << (20 - 12));
>> +
>> + virt = (unsigned long)pdx_to_page(pdx_s);
>> + rc = map_pages_to_xen(virt, base_mfn, chunk_size >> PAGE_SHIFT,
>> + PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
>> + if ( rc )
>> + panic("Unable to setup the frametable mappings\n");
>> +
>> + memset(pdx_to_page(pdx_s), 0, nr_pdxs * sizeof(struct page_info));
>> + memset(pdx_to_page(pdx_e), -1,
>> + chunk_size - nr_pdxs * sizeof(struct page_info));
>> +}
>> +
>> +void __init init_frametable(paddr_t ram_start)
>> +{
>> + unsigned int sidx, nidx, max_idx;
>>
>> /*
>> * The size of paddr_t should be sufficient for the complete range of
>> @@ -26,24 +47,34 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
>> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
>> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>>
>> - if ( frametable_size > FRAMETABLE_SIZE )
>> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
>> - ps, pe);
>> + max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT);
>> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
>>
>> - frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ps));
>> - /* Round up to 2M or 32M boundary, as appropriate. */
>> - frametable_size = ROUNDUP(frametable_size, mapping_size);
>> - base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12));
>> + /*
>> + * pdx_to_page(pdx_s) in init_frametable_chunk must be page-aligned
>> + * for map_pages_to_xen(). Aligning to PDX_GROUP_COUNT guarantees this
>> + * because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
>> + * PAGE_SIZE by construction.
>> + */
>> + frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
>
> We are now rounding down frametable_base_pdx which before this patch it was the start of the ram,
> but in xen/xen/arch/arm/include/asm/mm.h, mfn_valid(mfn) is using frametable_base_pdx to check for
> mfn validity, this means that we could pass an mfn before the start of the ram and if __mfn_valid is happy,
> we are getting a regression.
>
> Can this happen or am I missing something?
mfn_valid() can indeed return true for an MFN below ram_start that falls
in the same PDX group, but this is safe. init_frametable_chunk() maps
and zeroes the frametable for that range, so mfn_to_page() won't fault.
The zeroed page_info has count_info == 0 and no owner, so any get_page()
call on it will fail — the page is effectively inert.
~Michal
>
>>
>> - rc = map_pages_to_xen(FRAMETABLE_VIRT_START, base_mfn,
>> - frametable_size >> PAGE_SHIFT,
>> - PAGE_HYPERVISOR_RW | _PAGE_BLOCK);
>> - if ( rc )
>> - panic("Unable to setup the frametable mappings.\n");
>> + if ( (max_pdx - frametable_base_pdx) > FRAMETABLE_NR )
>> + panic("Frametable too small\n");
>> +
>> + for ( sidx = (frametable_base_pdx / PDX_GROUP_COUNT); ; sidx = nidx )
>> + {
>> + unsigned int eidx;
>> +
>> + eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
>> + nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
>> +
>> + if ( nidx >= max_idx )
>> + break;
>> +
>> + init_frametable_chunk(sidx * PDX_GROUP_COUNT, eidx * PDX_GROUP_COUNT);
>> + }
>>
>> - memset(&frame_table[0], 0, nr_pdxs * sizeof(struct page_info));
>> - memset(&frame_table[nr_pdxs], -1,
>> - frametable_size - (nr_pdxs * sizeof(struct page_info)));
>> + init_frametable_chunk(sidx * PDX_GROUP_COUNT, max_pdx);
>> }
>>
>> /*
>> diff --git a/xen/arch/arm/mpu/mm.c b/xen/arch/arm/mpu/mm.c
>> index aff88bd3a9c1..9c568831c128 100644
>> --- a/xen/arch/arm/mpu/mm.c
>> +++ b/xen/arch/arm/mpu/mm.c
>> @@ -186,16 +186,15 @@ static int is_mm_attr_match(pr_t *region, unsigned int attributes)
>> return 0;
>> }
>>
>> -/* Map a frame table to cover physical addresses ps through pe */
>> -void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
>> +/*
>> + * Allocate a contiguous frame table covering ram_start through max_pdx.
>> + * Unlike the MMU version, MPU cannot skip holes because there is no virtual
>> + * address translation (ma == va).
>> + */
>> +void __init init_frametable(paddr_t ram_start)
>> {
>> + unsigned long nr_pdxs, frametable_size;
>> mfn_t base_mfn;
>> - paddr_t aligned_ps = ROUNDUP(ps, PAGE_SIZE);
>> - paddr_t aligned_pe = ROUNDDOWN(pe, PAGE_SIZE);
>> -
>> - unsigned long nr_pdxs = mfn_to_pdx(mfn_add(maddr_to_mfn(aligned_pe), -1)) -
>> - mfn_to_pdx(maddr_to_mfn(aligned_ps)) + 1;
>> - unsigned long frametable_size = nr_pdxs * sizeof(struct page_info);
>>
>> /*
>> * The size of paddr_t should be sufficient for the complete range of
>> @@ -204,11 +203,13 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
>> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
>> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>>
>> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
>> + nr_pdxs = max_pdx - frametable_base_pdx;
>> + frametable_size = nr_pdxs * sizeof(struct page_info);
>> +
>> if ( frametable_size > FRAMETABLE_SIZE )
>> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
>> - ps, pe);
>> + panic("Frametable too small\n");
>>
>> - frametable_base_pdx = paddr_to_pdx(aligned_ps);
>> frametable_size = ROUNDUP(frametable_size, PAGE_SIZE);
>>
>> base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 1);
>> --
>> 2.43.0
>>
>
Hi Michal,
>>> +void __init init_frametable(paddr_t ram_start)
>>> +{
>>> + unsigned int sidx, nidx, max_idx;
>>>
>>> /*
>>> * The size of paddr_t should be sufficient for the complete range of
>>> @@ -26,24 +47,34 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
>>> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
>>> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>>>
>>> - if ( frametable_size > FRAMETABLE_SIZE )
>>> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
>>> - ps, pe);
>>> + max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT);
>>> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
>>>
>>> - frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ps));
>>> - /* Round up to 2M or 32M boundary, as appropriate. */
>>> - frametable_size = ROUNDUP(frametable_size, mapping_size);
>>> - base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12));
>>> + /*
>>> + * pdx_to_page(pdx_s) in init_frametable_chunk must be page-aligned
>>> + * for map_pages_to_xen(). Aligning to PDX_GROUP_COUNT guarantees this
>>> + * because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
>>> + * PAGE_SIZE by construction.
>>> + */
>>> + frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
>>
>> We are now rounding down frametable_base_pdx which before this patch it was the start of the ram,
>> but in xen/xen/arch/arm/include/asm/mm.h, mfn_valid(mfn) is using frametable_base_pdx to check for
>> mfn validity, this means that we could pass an mfn before the start of the ram and if __mfn_valid is happy,
>> we are getting a regression.
>>
>> Can this happen or am I missing something?
> mfn_valid() can indeed return true for an MFN below ram_start that falls
> in the same PDX group, but this is safe. init_frametable_chunk() maps
> and zeroes the frametable for that range, so mfn_to_page() won't fault.
> The zeroed page_info has count_info == 0 and no owner, so any get_page()
> call on it will fail — the page is effectively inert.
Yes, I’ve checked, and many paths relying on mfn_valid() also go through mfn_to_page()
and/or get_page(); there is only one, in process_shm(), that could potentially add a shared memory page,
given that we are relaxing mfn_valid() now.
I’m also trying to follow is_iomem_page(), to check whether subsequent mfn_to_page() calls fail safely, but I think that
depending on that (mfn_valid) the page will only be treated differently; I’m not sure whether it’s a latent bug to leave mfn_valid() as it is.
Would it be valid to have something like mfn_to_page() != 0 to be part of mfn_valid() to ensure it’s a real host ram page?
I’m truly asking here because I didn’t check if it’s doable.
Otherwise, maybe we could split the rounded-down group base from frametable_base_pdx in this way?
diff --git a/xen/arch/arm/include/asm/mm.h b/xen/arch/arm/include/asm/mm.h
index 605ec1bcd108..a6802d331178 100644
--- a/xen/arch/arm/include/asm/mm.h
+++ b/xen/arch/arm/include/asm/mm.h
@@ -183,6 +183,7 @@ struct page_info
/* PDX of the first page in the frame table. */
extern unsigned long frametable_base_pdx;
+extern unsigned long frametable_base_grp;
#define PDX_GROUP_SHIFT SECOND_SHIFT
@@ -228,9 +229,9 @@ static inline void __iomem *ioremap_wc(paddr_t start, size_t len)
/* Convert between machine frame numbers and page-info structures. */
#define mfn_to_page(mfn) \
- (frame_table + (mfn_to_pdx(mfn) - frametable_base_pdx))
+ (frame_table + (mfn_to_pdx(mfn) - frametable_base_grp))
#define page_to_mfn(pg) \
- pdx_to_mfn((unsigned long)((pg) - frame_table) + frametable_base_pdx)
+ pdx_to_mfn((unsigned long)((pg) - frame_table) + frametable_base_grp)
/* Convert between machine addresses and page-info structures. */
#define maddr_to_page(ma) mfn_to_page(maddr_to_mfn(ma))
diff --git a/xen/arch/arm/include/asm/mmu/mm.h b/xen/arch/arm/include/asm/mmu/mm.h
index 7f4d59137d0d..48a0e342307b 100644
--- a/xen/arch/arm/include/asm/mmu/mm.h
+++ b/xen/arch/arm/include/asm/mmu/mm.h
@@ -88,7 +88,7 @@ static inline struct page_info *virt_to_page(const void *v)
pdx = (va - XENHEAP_VIRT_START) >> PAGE_SHIFT;
pdx += mfn_to_pdx(directmap_mfn_start);
- return frame_table + pdx - frametable_base_pdx;
+ return frame_table + pdx - frametable_base_grp;
}
/*
diff --git a/xen/arch/arm/mm.c b/xen/arch/arm/mm.c
index e6b651956927..765896ec2c32 100644
--- a/xen/arch/arm/mm.c
+++ b/xen/arch/arm/mm.c
@@ -28,6 +28,8 @@
unsigned long frametable_base_pdx __read_mostly;
+unsigned long frametable_base_grp __read_mostly;
+
#if defined(CONFIG_ARM_64) || defined(CONFIG_MPU)
void __init setup_mm(void)
{
diff --git a/xen/arch/arm/mmu/mm.c b/xen/arch/arm/mmu/mm.c
index 4b4da349c16c..11c47c2d0e25 100644
--- a/xen/arch/arm/mmu/mm.c
+++ b/xen/arch/arm/mmu/mm.c
@@ -9,7 +9,7 @@
#include <xen/string.h>
#undef pdx_to_page
-#define pdx_to_page(pdx) gcc11_wrap(frame_table + ((pdx) - frametable_base_pdx))
+#define pdx_to_page(pdx) gcc11_wrap(frame_table + ((pdx) - frametable_base_grp))
static void __init
init_frametable_chunk(unsigned long pdx_s, unsigned long pdx_e)
@@ -56,12 +56,12 @@ void __init init_frametable(paddr_t ram_start)
* because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
* PAGE_SIZE by construction.
*/
- frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
+ frametable_base_grp = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
- if ( (max_pdx - frametable_base_pdx) > FRAMETABLE_NR )
+ if ( (max_pdx - frametable_base_grp) > FRAMETABLE_NR )
panic("Frametable too small\n");
- for ( sidx = (frametable_base_pdx / PDX_GROUP_COUNT); ; sidx = nidx )
+ for ( sidx = (frametable_base_grp / PDX_GROUP_COUNT); ; sidx = nidx )
{
unsigned int eidx;
diff --git a/xen/arch/arm/mpu/mm.c b/xen/arch/arm/mpu/mm.c
index bd7b02fd33b5..870bfd2be5e7 100644
--- a/xen/arch/arm/mpu/mm.c
+++ b/xen/arch/arm/mpu/mm.c
@@ -207,6 +207,7 @@ void __init init_frametable(paddr_t ram_start)
BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
+ frametable_base_grp = frametable_base_pdx;
nr_pdxs = max_pdx - frametable_base_pdx;
frametable_size = nr_pdxs * sizeof(struct page_info);
Not sure! Maybe another maintainer can give another opinion on whether I’m overthinking mfn_valid()
Cheers,
Luca
On 20/04/2026 16:06, Luca Fancellu wrote:
> HI Michal,
>
>>>> +void __init init_frametable(paddr_t ram_start)
>>>> +{
>>>> + unsigned int sidx, nidx, max_idx;
>>>>
>>>> /*
>>>> * The size of paddr_t should be sufficient for the complete range of
>>>> @@ -26,24 +47,34 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
>>>> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
>>>> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>>>>
>>>> - if ( frametable_size > FRAMETABLE_SIZE )
>>>> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
>>>> - ps, pe);
>>>> + max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT);
>>>> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
>>>>
>>>> - frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ps));
>>>> - /* Round up to 2M or 32M boundary, as appropriate. */
>>>> - frametable_size = ROUNDUP(frametable_size, mapping_size);
>>>> - base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12));
>>>> + /*
>>>> + * pdx_to_page(pdx_s) in init_frametable_chunk must be page-aligned
>>>> + * for map_pages_to_xen(). Aligning to PDX_GROUP_COUNT guarantees this
>>>> + * because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
>>>> + * PAGE_SIZE by construction.
>>>> + */
>>>> + frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
>>>
>>> We are now rounding down frametable_base_pdx which before this patch it was the start of the ram,
>>> but in xen/xen/arch/arm/include/asm/mm.h, mfn_valid(mfn) is using frametable_base_pdx to check for
>>> mfn validity, this means that we could pass an mfn before the start of the ram and if __mfn_valid is happy,
>>> we are getting a regression.
>>>
>>> Can this happen or am I missing something?
>> mfn_valid() can indeed return true for an MFN below ram_start that falls
>> in the same PDX group, but this is safe. init_frametable_chunk() maps
>> and zeroes the frametable for that range, so mfn_to_page() won't fault.
>> The zeroed page_info has count_info == 0 and no owner, so any get_page()
>> call on it will fail — the page is effectively inert.
>
> Yes, I’ve checked and many paths relying on mfn_valid() also go through mfn_to_page()
> and/or get_page(), there is only one in process_shm() that potentially could add a shared memory page
> given that we are relaxing mfn_valid now.
We are not relaxing mfn_valid now. It was never meant to mean RAM. It means the
MFN has a corresponding frame table entry with struct page_info. This is what
its comment says.
Static shmem is safe for ghost MFN. In prepare_staticmem_pages() we have a check
for count_info that for ghost MFN is 0.
>
> I’m trying also to follow is_iomem_page(), to check if subsequent mfn_to_page() fail safely, but I think that depending
> on that (mfn_valid) the page will only be treated differently, not sure if it’s a latent bug to leave mfn_valid() as it is.
On Arm, is_iomem_page() is just !mfn_valid(), so yes, ghost MFNs
would be classified as not IOMEM. But this is harmless because is_iomem_page()
on ARM is only called from grant table paths that operate on pages obtained by
get_page(), so no Arm code can reach is_iomem_page with a ghost MFN.
>
> Would it be valid to have something like mfn_to_page() != 0 to be part of mfn_valid() to ensure it’s a real host ram page?
> I’m truly asking here because I didn’t check if it’s doable.
As already said, mfn_valid was never supposed to mean RAM page. It is just that
on ARM the two happened to coincide until this patch.
>
> Otherwise we could split the round down and the frametable_base_pdx in this way maybe?
I don't think we need this extra complexity for something that is safe and has
always been in use. We've never had mfn_valid == RAM guarantee.
~Michal
Hi Michal,
> On 21 Apr 2026, at 11:33, Orzel, Michal <Michal.Orzel@amd.com> wrote:
>
>
>
> On 20/04/2026 16:06, Luca Fancellu wrote:
>> HI Michal,
>>
>>>>> +void __init init_frametable(paddr_t ram_start)
>>>>> +{
>>>>> + unsigned int sidx, nidx, max_idx;
>>>>>
>>>>> /*
>>>>> * The size of paddr_t should be sufficient for the complete range of
>>>>> @@ -26,24 +47,34 @@ void __init setup_frametable_mappings(paddr_t ps, paddr_t pe)
>>>>> BUILD_BUG_ON((sizeof(paddr_t) * BITS_PER_BYTE) < PADDR_BITS);
>>>>> BUILD_BUG_ON(sizeof(struct page_info) != PAGE_INFO_SIZE);
>>>>>
>>>>> - if ( frametable_size > FRAMETABLE_SIZE )
>>>>> - panic("The frametable cannot cover the physical region %#"PRIpaddr" - %#"PRIpaddr"\n",
>>>>> - ps, pe);
>>>>> + max_idx = DIV_ROUND_UP(max_pdx, PDX_GROUP_COUNT);
>>>>> + frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ram_start));
>>>>>
>>>>> - frametable_base_pdx = mfn_to_pdx(maddr_to_mfn(ps));
>>>>> - /* Round up to 2M or 32M boundary, as appropriate. */
>>>>> - frametable_size = ROUNDUP(frametable_size, mapping_size);
>>>>> - base_mfn = alloc_boot_pages(frametable_size >> PAGE_SHIFT, 32<<(20-12));
>>>>> + /*
>>>>> + * pdx_to_page(pdx_s) in init_frametable_chunk must be page-aligned
>>>>> + * for map_pages_to_xen(). Aligning to PDX_GROUP_COUNT guarantees this
>>>>> + * because PDX_GROUP_COUNT * sizeof(page_info) is always a multiple of
>>>>> + * PAGE_SIZE by construction.
>>>>> + */
>>>>> + frametable_base_pdx = ROUNDDOWN(frametable_base_pdx, PDX_GROUP_COUNT);
>>>>
>>>> We are now rounding down frametable_base_pdx which before this patch it was the start of the ram,
>>>> but in xen/xen/arch/arm/include/asm/mm.h, mfn_valid(mfn) is using frametable_base_pdx to check for
>>>> mfn validity, this means that we could pass an mfn before the start of the ram and if __mfn_valid is happy,
>>>> we are getting a regression.
>>>>
>>>> Can this happen or am I missing something?
>>> mfn_valid() can indeed return true for an MFN below ram_start that falls
>>> in the same PDX group, but this is safe. init_frametable_chunk() maps
>>> and zeroes the frametable for that range, so mfn_to_page() won't fault.
>>> The zeroed page_info has count_info == 0 and no owner, so any get_page()
>>> call on it will fail — the page is effectively inert.
>>
>> Yes, I’ve checked and many paths relying on mfn_valid() also go through mfn_to_page()
>> and/or get_page(), there is only one in process_shm() that potentially could add a shared memory page
>> given that we are relaxing mfn_valid now.
> We are not relaxing mfn_valid now. It was never meant to mean RAM. It means the
> MFN has a corresponding frame table entry with struct page_info. This is what
> its comment says.
Thanks for the clarification, I was under the impression that on Arm it signified that the mfn points to a real host RAM
portion; the comment above the Arm implementation doesn’t help either :)
>
> Static shmem is safe for ghost MFN. In prepare_staticmem_pages() we have a check
> for count_info that for ghost MFN is 0.
>
>>
>> I’m trying also to follow is_iomem_page(), to check if subsequent mfn_to_page() fail safely, but I think that depending
>> on that (mfn_valid) the page will only be treated differently, not sure if it’s a latent bug to leave mfn_valid() as it is.
> On Arm, is_iomem_page() is just !mfn_valid(), so yes, ghost MFNs
> would be classified as not IOMEM. But this is harmless because is_iomem_page()
> on ARM is only called from grant table paths that operate on pages obtained by
> get_page(), so no Arm code can reach is_iomem_page with a ghost MFN.
>
>>
>> Would it be valid to have something like mfn_to_page() != 0 to be part of mfn_valid() to ensure it’s a real host ram page?
>> I’m truly asking here because I didn’t check if it’s doable.
> As already said, mfn_valid was never supposed to mean RAM page. It is just that
> on ARM the two happened to coincide until this patch.
>
>>
>> Otherwise we could split the round down and the frametable_base_pdx in this way maybe?
> I don't think we need this extra complexity for something that is safe and has
> always been in use. We've never had mfn_valid == RAM guarantee.
Thanks for checking my points, I think it’s all clear to me now.
Reviewed-by: Luca Fancellu <luca.fancellu@arm.com>
Cheers,
Luca
On Tue, 21 Apr 2026, Luca Fancellu wrote:
> >> Otherwise we could split the round down and the frametable_base_pdx in this way maybe?
> > I don't think we need this extra complexity for something that is safe and has
> > always been in use. We've never had mfn_valid == RAM guarantee.
>
> Thanks for checking my points, I think it’s all clear to me now.
>
> Reviewed-by: Luca Fancellu <luca.fancellu@arm.com>

Acked-by: Stefano Stabellini <sstabellini@kernel.org>
© 2016 - 2026 Red Hat, Inc.