[PATCH v2 12/17] xen/riscv: Implement p2m_free_entry() and related helpers

Posted by Oleksii Kurochko 4 months, 3 weeks ago
This patch introduces a working implementation of p2m_free_entry() for RISC-V,
based on Arm's implementation, enabling proper cleanup of page table entries
in the P2M (physical-to-machine) mapping.

Only a few things are changed:
- Use p2m_force_tlb_flush_sync() instead of p2m_tlb_flush_sync(), as the
  latter isn't implemented on RISC-V.
- Introduce and use p2m_type_radix_get() to get the type of a p2m entry, as
  RISC-V's PTE doesn't have enough space to store all necessary types, so
  the type is stored in a radix tree.

Key additions include:
- p2m_free_entry(): Recursively frees page table entries at all levels. It
  handles both regular and superpage mappings and ensures that TLB entries
  are flushed before freeing intermediate tables.
- p2m_put_page() and helpers:
  - p2m_put_4k_page(): Clears GFN from xenheap pages if applicable.
  - p2m_put_2m_superpage(): Releases foreign page references in a 2MB
    superpage.
  - p2m_type_radix_get(): Extracts the stored p2m_type from the radix tree
    using the PTE.
- p2m_free_page(): Returns a page either to the domain's freelist or to
  the domheap, depending on whether the domain is hardware-backed.

Defines XEN_PT_ENTRIES in asm/page.h to simplify loops over page table
entries.

Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
Changes in V2:
 - New patch. It was part of the big patch "xen/riscv: implement p2m mapping
   functionality" which was split into smaller ones.
 - s/p2m_is_superpage/p2me_is_superpage.
---
 xen/arch/riscv/include/asm/page.h |   1 +
 xen/arch/riscv/p2m.c              | 144 +++++++++++++++++++++++++++++-
 2 files changed, 142 insertions(+), 3 deletions(-)

diff --git a/xen/arch/riscv/include/asm/page.h b/xen/arch/riscv/include/asm/page.h
index 1b8b145663..c67b9578c9 100644
--- a/xen/arch/riscv/include/asm/page.h
+++ b/xen/arch/riscv/include/asm/page.h
@@ -22,6 +22,7 @@
 #define XEN_PT_LEVEL_SIZE(lvl)      (_AT(paddr_t, 1) << XEN_PT_LEVEL_SHIFT(lvl))
 #define XEN_PT_LEVEL_MAP_MASK(lvl)  (~(XEN_PT_LEVEL_SIZE(lvl) - 1))
 #define XEN_PT_LEVEL_MASK(lvl)      (VPN_MASK << XEN_PT_LEVEL_SHIFT(lvl))
+#define XEN_PT_ENTRIES              (_AT(unsigned int, 1) << PAGETABLE_ORDER)
 
 /*
  * PTE format:
diff --git a/xen/arch/riscv/p2m.c b/xen/arch/riscv/p2m.c
index 27499a86bb..6b11e87b22 100644
--- a/xen/arch/riscv/p2m.c
+++ b/xen/arch/riscv/p2m.c
@@ -345,11 +345,33 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
     return __map_domain_page(p2m->root + root_table_indx);
 }
 
+static p2m_type_t p2m_type_radix_get(struct p2m_domain *p2m, pte_t pte)
+{
+    void *ptr;
+    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
+
+    ptr = radix_tree_lookup(&p2m->p2m_type, gfn_x(gfn));
+
+    if ( !ptr )
+        return p2m_invalid;
+
+    return radix_tree_ptr_to_int(ptr);
+}
+
+/*
+ * In the case of the P2M, the valid bit is used for other purposes. Use
+ * the type to check whether an entry is valid.
+ */
 static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
 {
-    panic("%s: isn't implemented for now\n", __func__);
+    return p2m_type_radix_get(p2m, pte) != p2m_invalid;
+}
 
-    return false;
+static inline bool p2me_is_superpage(struct p2m_domain *p2m, pte_t pte,
+                                    unsigned int level)
+{
+    return p2me_is_valid(p2m, pte) && (pte.pte & PTE_ACCESS_MASK) &&
+           (level > 0);
 }
 
 static inline void p2m_write_pte(pte_t *p, pte_t pte, bool clean_pte)
@@ -404,11 +426,127 @@ static int p2m_next_level(struct p2m_domain *p2m, bool alloc_tbl,
     return GUEST_TABLE_MAP_NONE;
 }
 
+static void p2m_put_foreign_page(struct page_info *pg)
+{
+    /*
+     * It's safe to do the put_page here because page_alloc will
+     * flush the TLBs if the page is reallocated before the end of
+     * this loop.
+     */
+    put_page(pg);
+}
+
+/* Put any references on the single 4K page referenced by mfn. */
+static void p2m_put_4k_page(mfn_t mfn, p2m_type_t type)
+{
+    /* TODO: Handle other p2m types */
+
+    /* Detect the xenheap page and mark the stored GFN as invalid. */
+    if ( p2m_is_ram(type) && is_xen_heap_mfn(mfn) )
+        page_set_xenheap_gfn(mfn_to_page(mfn), INVALID_GFN);
+}
+
+/* Put any references on the superpage referenced by mfn. */
+static void p2m_put_2m_superpage(mfn_t mfn, p2m_type_t type)
+{
+    struct page_info *pg;
+    unsigned int i;
+
+    ASSERT(mfn_valid(mfn));
+
+    pg = mfn_to_page(mfn);
+
+    for ( i = 0; i < XEN_PT_ENTRIES; i++, pg++ )
+        p2m_put_foreign_page(pg);
+}
+
+/* Put any references on the page referenced by pte. */
+static void p2m_put_page(struct p2m_domain *p2m, const pte_t pte,
+                         unsigned int level)
+{
+    mfn_t mfn = pte_get_mfn(pte);
+    p2m_type_t p2m_type = p2m_type_radix_get(p2m, pte);
+
+    ASSERT(p2me_is_valid(p2m, pte));
+
+    /*
+     * TODO: Currently we don't handle level 2 super-pages. Xen is not
+     * preemptible, and therefore some work is needed to handle such
+     * superpages: at some point Xen might end up freeing the memory
+     * behind such a big mapping, which could turn into a very long
+     * operation.
+     */
+    if ( level == 1 )
+        return p2m_put_2m_superpage(mfn, p2m_type);
+    else if ( level == 0 )
+        return p2m_put_4k_page(mfn, p2m_type);
+}
+
+static void p2m_free_page(struct domain *d, struct page_info *pg)
+{
+    if ( is_hardware_domain(d) )
+        free_domheap_page(pg);
+    else
+    {
+        spin_lock(&d->arch.paging.lock);
+        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
+        spin_unlock(&d->arch.paging.lock);
+    }
+}
+
 /* Free pte sub-tree behind an entry */
 static void p2m_free_entry(struct p2m_domain *p2m,
                            pte_t entry, unsigned int level)
 {
-    panic("%s: hasn't been implemented yet\n", __func__);
+    unsigned int i;
+    pte_t *table;
+    mfn_t mfn;
+    struct page_info *pg;
+
+    /* Nothing to do if the entry is invalid. */
+    if ( !p2me_is_valid(p2m, entry) )
+        return;
+
+    if ( p2me_is_superpage(p2m, entry, level) || (level == 0) )
+    {
+#ifdef CONFIG_IOREQ_SERVER
+        /*
+         * If this gets called then either the entry was replaced by an entry
+         * with a different base (valid case) or the shattering of a superpage
+         * has failed (error case).
+         * So, at worst, a spurious mapcache invalidation might be sent.
+         */
+        if ( p2m_is_ram( p2m_type_radix_get(p2m, entry)) &&
+             domain_has_ioreq_server(p2m->domain) )
+            ioreq_request_mapcache_invalidate(p2m->domain);
+#endif
+
+        p2m_put_page(p2m, entry, level);
+
+        return;
+    }
+
+    table = map_domain_page(pte_get_mfn(entry));
+    for ( i = 0; i < XEN_PT_ENTRIES; i++ )
+        p2m_free_entry(p2m, *(table + i), level - 1);
+
+    unmap_domain_page(table);
+
+    /*
+     * Make sure all the references in the TLB have been removed before
+     * freeing the intermediate page table.
+     * XXX: Should we defer the free of the page table to avoid the
+     * flush?
+     */
+    p2m_force_tlb_flush_sync(p2m);
+
+    mfn = pte_get_mfn(entry);
+    ASSERT(mfn_valid(mfn));
+
+    pg = mfn_to_page(mfn);
+
+    page_list_del(pg, &p2m->pages);
+    p2m_free_page(p2m->domain, pg);
 }
 
 /*
-- 
2.49.0
Re: [PATCH v2 12/17] xen/riscv: Implement p2m_free_entry() and related helpers
Posted by Jan Beulich 4 months ago
On 10.06.2025 15:05, Oleksii Kurochko wrote:
> This patch introduces a working implementation of p2m_free_entry() for RISC-V,
> based on Arm's implementation, enabling proper cleanup of page table entries
> in the P2M (physical-to-machine) mapping.
> 
> Only a few things are changed:
> - Use p2m_force_tlb_flush_sync() instead of p2m_tlb_flush_sync(), as the
>   latter isn't implemented on RISC-V.
> - Introduce and use p2m_type_radix_get() to get the type of a p2m entry, as
>   RISC-V's PTE doesn't have enough space to store all necessary types, so
>   the type is stored in a radix tree.
> 
> Key additions include:
> - p2m_free_entry(): Recursively frees page table entries at all levels. It
>   handles both regular and superpage mappings and ensures that TLB entries
>   are flushed before freeing intermediate tables.
> - p2m_put_page() and helpers:
>   - p2m_put_4k_page(): Clears GFN from xenheap pages if applicable.
>   - p2m_put_2m_superpage(): Releases foreign page references in a 2MB
>     superpage.
>   - p2m_type_radix_get(): Extracts the stored p2m_type from the radix tree
>     using the PTE.
> - p2m_free_page(): Returns a page either to the domain's freelist or to
>   the domheap, depending on whether the domain is hardware-backed.

What is "hardware-backed"?

> Defines XEN_PT_ENTRIES in asm/page.h to simplify loops over page table
> entries.
> 
> Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
> ---
> Changes in V2:
>  - New patch. It was a part of a big patch "xen/riscv: implement p2m mapping
>    functionality" which was splitted to smaller.
>  - s/p2m_is_superpage/p2me_is_superpage.

See my earlier comments regarding naming.

> --- a/xen/arch/riscv/p2m.c
> +++ b/xen/arch/riscv/p2m.c
> @@ -345,11 +345,33 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>      return __map_domain_page(p2m->root + root_table_indx);
>  }
>  
> +static p2m_type_t p2m_type_radix_get(struct p2m_domain *p2m, pte_t pte)

Does it matter to callers that ...

> +{
> +    void *ptr;
> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
> +
> +    ptr = radix_tree_lookup(&p2m->p2m_type, gfn_x(gfn));
> +
> +    if ( !ptr )
> +        return p2m_invalid;
> +
> +    return radix_tree_ptr_to_int(ptr);
> +}

... this is a radix tree lookup? IOW does "radix" need to be part of the
function name? Also "get" may want to move forward in the name, to better
match the naming of other functions.

> +/*
> + * In the case of the P2M, the valid bit is used for other purposes. Use
> + * the type to check whether an entry is valid.
> + */
>  static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>  {
> -    panic("%s: isn't implemented for now\n", __func__);
> +    return p2m_type_radix_get(p2m, pte) != p2m_invalid;
> +}

No checking of the valid bit?

> -    return false;
> +static inline bool p2me_is_superpage(struct p2m_domain *p2m, pte_t pte,
> +                                    unsigned int level)
> +{
> +    return p2me_is_valid(p2m, pte) && (pte.pte & PTE_ACCESS_MASK) &&
> +           (level > 0);

In such combinations of conditions it's usually helpful to put the
cheapest check(s) first. IOW what point is there in doing a radix
tree lookup when the other two conditions aren't satisfied? (FTAOD
applies elsewhere as well, even within this same patch.)
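
I.e., something along these lines (untested, merely to illustrate the
ordering):

    static inline bool p2me_is_superpage(struct p2m_domain *p2m, pte_t pte,
                                         unsigned int level)
    {
        /* Cheap level and bit tests first; radix tree lookup last. */
        return (level > 0) && (pte.pte & PTE_ACCESS_MASK) &&
               p2me_is_valid(p2m, pte);
    }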

> @@ -404,11 +426,127 @@ static int p2m_next_level(struct p2m_domain *p2m, bool alloc_tbl,
>      return GUEST_TABLE_MAP_NONE;
>  }
>  
> +static void p2m_put_foreign_page(struct page_info *pg)
> +{
> +    /*
> +     * It's safe to do the put_page here because page_alloc will
> +     * flush the TLBs if the page is reallocated before the end of
> +     * this loop.
> +     */
> +    put_page(pg);

Is the comment really true? The page allocator will flush the normal
TLBs, but not the stage-2 ones. Yet those are what you care about here,
aiui.

> +/* Put any references on the single 4K page referenced by mfn. */
> +static void p2m_put_4k_page(mfn_t mfn, p2m_type_t type)
> +{
> +    /* TODO: Handle other p2m types */
> +
> +    /* Detect the xenheap page and mark the stored GFN as invalid. */
> +    if ( p2m_is_ram(type) && is_xen_heap_mfn(mfn) )
> +        page_set_xenheap_gfn(mfn_to_page(mfn), INVALID_GFN);

Is this a valid thing to do? How do you make sure the respective uses
(in gnttab's shared and status page arrays) are / were also removed?

> +}
> +
> +/* Put any references on the superpage referenced by mfn. */
> +static void p2m_put_2m_superpage(mfn_t mfn, p2m_type_t type)
> +{
> +    struct page_info *pg;
> +    unsigned int i;
> +
> +    ASSERT(mfn_valid(mfn));
> +
> +    pg = mfn_to_page(mfn);
> +
> +    for ( i = 0; i < XEN_PT_ENTRIES; i++, pg++ )
> +        p2m_put_foreign_page(pg);
> +}
> +
> +/* Put any references on the page referenced by pte. */
> +static void p2m_put_page(struct p2m_domain *p2m, const pte_t pte,
> +                         unsigned int level)
> +{
> +    mfn_t mfn = pte_get_mfn(pte);
> +    p2m_type_t p2m_type = p2m_type_radix_get(p2m, pte);

This gives you the type of the 1st page. What guarantees that all other pages
in a superpage are of the exact same type?

> +    ASSERT(p2me_is_valid(p2m, pte));
> +
> +    /*
> +     * TODO: Currently we don't handle level 2 super-page, Xen is not
> +     * preemptible and therefore some work is needed to handle such
> +     * superpages, for which at some point Xen might end up freeing memory
> +     * and therefore for such a big mapping it could end up in a very long
> +     * operation.
> +     */

This is pretty unsatisfactory. Imo, if you don't deal with that right away,
you're setting yourself up for a significant re-write.

> +    if ( level == 1 )
> +        return p2m_put_2m_superpage(mfn, p2m_type);
> +    else if ( level == 0 )
> +        return p2m_put_4k_page(mfn, p2m_type);

Use switch() right away?
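
E.g. (untested):

    switch ( level )
    {
    case 1:
        p2m_put_2m_superpage(mfn, p2m_type);
        break;

    case 0:
        p2m_put_4k_page(mfn, p2m_type);
        break;
    }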

> +}
> +
> +static void p2m_free_page(struct domain *d, struct page_info *pg)
> +{
> +    if ( is_hardware_domain(d) )
> +        free_domheap_page(pg);

Why's the hardware domain different here? It should have a pool just like
all other domains have.

> +    else
> +    {
> +        spin_lock(&d->arch.paging.lock);
> +        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
> +        spin_unlock(&d->arch.paging.lock);
> +    }
> +}
> +
>  /* Free pte sub-tree behind an entry */
>  static void p2m_free_entry(struct p2m_domain *p2m,
>                             pte_t entry, unsigned int level)
>  {
> -    panic("%s: hasn't been implemented yet\n", __func__);
> +    unsigned int i;
> +    pte_t *table;
> +    mfn_t mfn;
> +    struct page_info *pg;
> +
> +    /* Nothing to do if the entry is invalid. */
> +    if ( !p2me_is_valid(p2m, entry) )
> +        return;

Does this actually apply to intermediate page tables (which you handle
later in the function), when that's (only) a P2M type check?

> +    if ( p2me_is_superpage(p2m, entry, level) || (level == 0) )
> +    {
> +#ifdef CONFIG_IOREQ_SERVER
> +        /*
> +         * If this gets called then either the entry was replaced by an entry
> +         * with a different base (valid case) or the shattering of a superpage
> +         * has failed (error case).
> +         * So, at worst, a spurious mapcache invalidation might be sent.
> +         */
> +        if ( p2m_is_ram( p2m_type_radix_get(p2m, entry)) &&

Nit: Style.

> +             domain_has_ioreq_server(p2m->domain) )
> +            ioreq_request_mapcache_invalidate(p2m->domain);
> +#endif
> +
> +        p2m_put_page(p2m, entry, level);
> +
> +        return;
> +    }
> +
> +    table = map_domain_page(pte_get_mfn(entry));
> +    for ( i = 0; i < XEN_PT_ENTRIES; i++ )
> +        p2m_free_entry(p2m, *(table + i), level - 1);

Better table[i]?

Jan
Re: [PATCH v2 12/17] xen/riscv: Implement p2m_free_entry() and related helpers
Posted by Oleksii Kurochko 3 months, 3 weeks ago
On 7/1/25 4:23 PM, Jan Beulich wrote:
> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>> This patch introduces a working implementation of p2m_free_entry() for RISC-V,
>> based on Arm's implementation, enabling proper cleanup of page table entries
>> in the P2M (physical-to-machine) mapping.
>>
>> Only a few things are changed:
>> - Use p2m_force_tlb_flush_sync() instead of p2m_tlb_flush_sync(), as the
>>    latter isn't implemented on RISC-V.
>> - Introduce and use p2m_type_radix_get() to get the type of a p2m entry, as
>>    RISC-V's PTE doesn't have enough space to store all necessary types, so
>>    the type is stored in a radix tree.
>>
>> Key additions include:
>> - p2m_free_entry(): Recursively frees page table entries at all levels. It
>>    handles both regular and superpage mappings and ensures that TLB entries
>>    are flushed before freeing intermediate tables.
>> - p2m_put_page() and helpers:
>>    - p2m_put_4k_page(): Clears GFN from xenheap pages if applicable.
>>    - p2m_put_2m_superpage(): Releases foreign page references in a 2MB
>>      superpage.
>>    - p2m_type_radix_get(): Extracts the stored p2m_type from the radix tree
>>      using the PTE.
>> - p2m_free_page(): Returns a page either to the domain's freelist or to
>>    the domheap, depending on whether the domain is hardware-backed.
> What is "hardware-backed"?

It basically means the hardware domain, i.e. Dom0.

>> --- a/xen/arch/riscv/p2m.c
>> +++ b/xen/arch/riscv/p2m.c
>> @@ -345,11 +345,33 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>>       return __map_domain_page(p2m->root + root_table_indx);
>>   }
>>   
>> +static p2m_type_t p2m_type_radix_get(struct p2m_domain *p2m, pte_t pte)
> Does it matter to callers that ...
>
>> +{
>> +    void *ptr;
>> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
>> +
>> +    ptr = radix_tree_lookup(&p2m->p2m_type, gfn_x(gfn));
>> +
>> +    if ( !ptr )
>> +        return p2m_invalid;
>> +
>> +    return radix_tree_ptr_to_int(ptr);
>> +}
> ... this is a radix tree lookup? IOW does "radix" need to be part of the
> function name? Also "get" may want to move forward in the name, to better
> match the naming of other functions.

Agree, it doesn't really matter, so I will rename it.

>> +/*
>> + * In the case of the P2M, the valid bit is used for other purposes. Use
>> + * the type to check whether an entry is valid.
>> + */
>>   static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>   {
>> -    panic("%s: isn't implemented for now\n", __func__);
>> +    return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>> +}
> No checking of the valid bit?

As mentioned in the comment, only the P2M type should be checked, since the
valid bit is used for other purposes we discussed earlier, for example, to
track whether pages were accessed by a guest domain, or to support certain
table invalidation optimizations (1) and (2).
So, in this case, we only need to consider whether the entry is invalid
from the P2M perspective.

(1) https://github.com/xen-project/xen/blob/19772b67/xen/arch/arm/mmu/p2m.c#L1245
(2) https://github.com/xen-project/xen/blob/19772b67/xen/arch/arm/mmu/p2m.c#L1386

>> @@ -404,11 +426,127 @@ static int p2m_next_level(struct p2m_domain *p2m, bool alloc_tbl,
>>       return GUEST_TABLE_MAP_NONE;
>>   }
>>   
>> +static void p2m_put_foreign_page(struct page_info *pg)
>> +{
>> +    /*
>> +     * It's safe to do the put_page here because page_alloc will
>> +     * flush the TLBs if the page is reallocated before the end of
>> +     * this loop.
>> +     */
>> +    put_page(pg);
> Is the comment really true? The page allocator will flush the normal
> TLBs, but not the stage-2 ones. Yet those are what you care about here,
> aiui.

In alloc_heap_pages():
  ...
      if ( need_tlbflush )
         filtered_flush_tlb_mask(tlbflush_timestamp);
  ...
  
filtered_flush_tlb_mask() calls arch_flush_tlb_mask().

and arch_flush_tlb_mask(), at least on Arm (I haven't checked x86), is
implemented as:
   void arch_flush_tlb_mask(const cpumask_t *mask)
   {
       /* No need to IPI other processors on ARM, the processor takes care of it. */
       flush_all_guests_tlb();
   }

So it flushes stage-2 TLB.

>
>> +/* Put any references on the single 4K page referenced by mfn. */
>> +static void p2m_put_4k_page(mfn_t mfn, p2m_type_t type)
>> +{
>> +    /* TODO: Handle other p2m types */
>> +
>> +    /* Detect the xenheap page and mark the stored GFN as invalid. */
>> +    if ( p2m_is_ram(type) && is_xen_heap_mfn(mfn) )
>> +        page_set_xenheap_gfn(mfn_to_page(mfn), INVALID_GFN);
> Is this a valid thing to do? How do you make sure the respective uses
> (in gnttab's shared and status page arrays) are / were also removed?

Since the grant table frame GFN is stored directly in struct page_info instead
of being kept in standalone status/shared arrays, there is no need for the
status/shared arrays.
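
If I'm not mistaken, on Arm the grant table GFN is nowadays re-derived from
struct page_info, along the lines of:

    #define gnttab_shared_gfn(d, t, i)                                   \
        page_get_xenheap_gfn(gnttab_shared_page(t, i))

so marking the page with page_set_xenheap_gfn(..., INVALID_GFN) is what
invalidates the stored GFN.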

>
>> +}
>> +
>> +/* Put any references on the superpage referenced by mfn. */
>> +static void p2m_put_2m_superpage(mfn_t mfn, p2m_type_t type)
>> +{
>> +    struct page_info *pg;
>> +    unsigned int i;
>> +
>> +    ASSERT(mfn_valid(mfn));
>> +
>> +    pg = mfn_to_page(mfn);
>> +
>> +    for ( i = 0; i < XEN_PT_ENTRIES; i++, pg++ )
>> +        p2m_put_foreign_page(pg);
>> +}
>> +
>> +/* Put any references on the page referenced by pte. */
>> +static void p2m_put_page(struct p2m_domain *p2m, const pte_t pte,
>> +                         unsigned int level)
>> +{
>> +    mfn_t mfn = pte_get_mfn(pte);
>> +    p2m_type_t p2m_type = p2m_type_radix_get(p2m, pte);
> This gives you the type of the 1st page. What guarantees that all other pages
> in a superpage are of the exact same type?

Doesn't superpage mean that all the 4KB pages within that superpage have the
same type and are contiguous in memory?

>
>> +    ASSERT(p2me_is_valid(p2m, pte));
>> +
>> +    /*
>> +     * TODO: Currently we don't handle level 2 super-pages. Xen is not
>> +     * preemptible, and therefore some work is needed to handle such
>> +     * superpages: at some point Xen might end up freeing the memory
>> +     * behind such a big mapping, which could turn into a very long
>> +     * operation.
>> +     */
> This is pretty unsatisfactory. Imo, if you don't deal with that right away,
> you're setting yourself up for a significant re-write.

Arm has lived with that for a long time, and it doesn't seem to be a big
issue there. Also, considering that the frametable supports only 4KB page
granularity, such big mappings could lead to long operations during memory
freeing.
And 1GB mapping isn't used for

>
>> +    if ( level == 1 )
>> +        return p2m_put_2m_superpage(mfn, p2m_type);
>> +    else if ( level == 0 )
>> +        return p2m_put_4k_page(mfn, p2m_type);
> Use switch() right away?

It could be; I think there's no big difference at the moment, at least.
But I am okay with reworking it.

>
>> +}
>> +
>> +static void p2m_free_page(struct domain *d, struct page_info *pg)
>> +{
>> +    if ( is_hardware_domain(d) )
>> +        free_domheap_page(pg);
> Why's the hardware domain different here? It should have a pool just like
> all other domains have.

The hardware domain (Dom0) should have no limit on the number of pages that
can be allocated, so allocating p2m pages for the hardware domain is done
from the heap.

The idea of the p2m pool is to provide a way to put a clear limit on the
amount of p2m allocation.

>
>> +    else
>> +    {
>> +        spin_lock(&d->arch.paging.lock);
>> +        page_list_add_tail(pg, &d->arch.paging.p2m_freelist);
>> +        spin_unlock(&d->arch.paging.lock);
>> +    }
>> +}
>> +
>>   /* Free pte sub-tree behind an entry */
>>   static void p2m_free_entry(struct p2m_domain *p2m,
>>                              pte_t entry, unsigned int level)
>>   {
>> -    panic("%s: hasn't been implemented yet\n", __func__);
>> +    unsigned int i;
>> +    pte_t *table;
>> +    mfn_t mfn;
>> +    struct page_info *pg;
>> +
>> +    /* Nothing to do if the entry is invalid. */
>> +    if ( !p2me_is_valid(p2m, entry) )
>> +        return;
> Does this actually apply to intermediate page tables (which you handle
> later in the function), when that's (only) a P2M type check?

Yes, any PTE should have the V bit set to 1, so from the P2M perspective it
should also, at least, not be equal to p2m_invalid.

~ Oleksii
Re: [PATCH v2 12/17] xen/riscv: Implement p2m_free_entry() and related helpers
Posted by Jan Beulich 3 months, 2 weeks ago
On 11.07.2025 17:56, Oleksii Kurochko wrote:
> On 7/1/25 4:23 PM, Jan Beulich wrote:
>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>> +/*
>>> + * In the case of the P2M, the valid bit is used for other purposes. Use
>>> + * the type to check whether an entry is valid.
>>> + */
>>>   static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>   {
>>> -    panic("%s: isn't implemented for now\n", __func__);
>>> +    return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>> +}
>> No checking of the valid bit?
> 
> As mentioned in the comment, only the P2M type should be checked, since the
> valid bit is used for other purposes we discussed earlier, for example, to
> track whether pages were accessed by a guest domain, or to support certain
> table invalidation optimizations (1) and (2).
> So, in this case, we only need to consider whether the entry is invalid
> from the P2M perspective.
> 
> (1) https://github.com/xen-project/xen/blob/19772b67/xen/arch/arm/mmu/p2m.c#L1245
> (2) https://github.com/xen-project/xen/blob/19772b67/xen/arch/arm/mmu/p2m.c#L1386

And there can be e.g. entries with the valid bit set and the type being
p2m_invalid? IOW there's no short-circuiting possible in any of the
possible cases, avoiding the radix tree lookup in at least some of the
cases?

>>> @@ -404,11 +426,127 @@ static int p2m_next_level(struct p2m_domain *p2m, bool alloc_tbl,
>>>       return GUEST_TABLE_MAP_NONE;
>>>   }
>>>   
>>> +static void p2m_put_foreign_page(struct page_info *pg)
>>> +{
>>> +    /*
>>> +     * It's safe to do the put_page here because page_alloc will
>>> +     * flush the TLBs if the page is reallocated before the end of
>>> +     * this loop.
>>> +     */
>>> +    put_page(pg);
>> Is the comment really true? The page allocator will flush the normal
>> TLBs, but not the stage-2 ones. Yet those are what you care about here,
>> aiui.
> 
> In alloc_heap_pages():
>   ...
>       if ( need_tlbflush )
>          filtered_flush_tlb_mask(tlbflush_timestamp);
>   ...
>   
> filtered_flush_tlb_mask() calls arch_flush_tlb_mask().
> 
> and arch_flush_tlb_mask(), at least on Arm (I haven't checked x86), is
> implemented as:
>    void arch_flush_tlb_mask(const cpumask_t *mask)
>    {
>        /* No need to IPI other processors on ARM, the processor takes care of it. */
>        flush_all_guests_tlb();
>    }
> 
> So it flushes stage-2 TLB.

Hmm, okay. And I take it you have the same plan on RISC-V? What I'd like to
ask for, though, is that the comment (also) mentions where that (guest)
flushing actually happens. That's not in page_alloc.c, and it also wasn't
originally intended for guest TLBs to also be flushed from there (as x86 is
where the flush avoidance machinery originates, which Arm and now also
RISC-V don't really use).
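
E.g. something along the lines of (wording mine, to be adjusted as you see
fit):

    /*
     * It's safe to do the put_page() here because page_alloc will flush
     * the TLBs if the page is reallocated before the end of this loop:
     * filtered_flush_tlb_mask() invokes arch_flush_tlb_mask(), which on
     * Arm (and, per the plan above, RISC-V) also flushes guest TLBs.
     */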

>>> +/* Put any references on the single 4K page referenced by mfn. */
>>> +static void p2m_put_4k_page(mfn_t mfn, p2m_type_t type)
>>> +{
>>> +    /* TODO: Handle other p2m types */
>>> +
>>> +    /* Detect the xenheap page and mark the stored GFN as invalid. */
>>> +    if ( p2m_is_ram(type) && is_xen_heap_mfn(mfn) )
>>> +        page_set_xenheap_gfn(mfn_to_page(mfn), INVALID_GFN);
>> Is this a valid thing to do? How do you make sure the respective uses
>> (in gnttab's shared and status page arrays) are / were also removed?
> 
> Since the grant table frame GFN is stored directly in struct page_info instead
> of being kept in standalone status/shared arrays, there is no need for the
> status/shared arrays.

I fear I don't follow. Looking at Arm's header (which I understand you
derive from), I see

#define gnttab_shared_page(t, i)   virt_to_page((t)->shared_raw[i])

#define gnttab_status_page(t, i)   virt_to_page((t)->status[i])

Are you intending to do things differently?

>>> +/* Put any references on the superpage referenced by mfn. */
>>> +static void p2m_put_2m_superpage(mfn_t mfn, p2m_type_t type)
>>> +{
>>> +    struct page_info *pg;
>>> +    unsigned int i;
>>> +
>>> +    ASSERT(mfn_valid(mfn));
>>> +
>>> +    pg = mfn_to_page(mfn);
>>> +
>>> +    for ( i = 0; i < XEN_PT_ENTRIES; i++, pg++ )
>>> +        p2m_put_foreign_page(pg);
>>> +}
>>> +
>>> +/* Put any references on the page referenced by pte. */
>>> +static void p2m_put_page(struct p2m_domain *p2m, const pte_t pte,
>>> +                         unsigned int level)
>>> +{
>>> +    mfn_t mfn = pte_get_mfn(pte);
>>> +    p2m_type_t p2m_type = p2m_type_radix_get(p2m, pte);
>> This gives you the type of the 1st page. What guarantees that all other pages
>> in a superpage are of the exact same type?
> 
> Doesn't superpage mean that all the 4KB pages within that superpage have the
> same type and are contiguous in memory?

If the mapping is a super-page one - yes. Yet I see nothing super-page-ish
here.

>>> +    if ( level == 1 )
>>> +        return p2m_put_2m_superpage(mfn, p2m_type);
>>> +    else if ( level == 0 )
>>> +        return p2m_put_4k_page(mfn, p2m_type);
>> Use switch() right away?
> 
> It could be; I think there's no big difference at the moment, at least.
> But I am okay with reworking it.

If you don't want to use switch() here, then my other style nit would
need giving: Please avoid "else" in situations like this.
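
I.e. (untested):

    if ( level == 1 )
        return p2m_put_2m_superpage(mfn, p2m_type);

    if ( level == 0 )
        return p2m_put_4k_page(mfn, p2m_type);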

>>> +static void p2m_free_page(struct domain *d, struct page_info *pg)
>>> +{
>>> +    if ( is_hardware_domain(d) )
>>> +        free_domheap_page(pg);
>> Why's the hardware domain different here? It should have a pool just like
>> all other domains have.
> 
> The hardware domain (Dom0) should have no limit on the number of pages that
> can be allocated, so allocating p2m pages for the hardware domain is done
> from the heap.
> 
> The idea of the p2m pool is to provide a way to put a clear limit on the
> amount of p2m allocation.

Well, we had been there on another thread, and I outlined how I think
Dom0 may want handling.

>>>   /* Free pte sub-tree behind an entry */
>>>   static void p2m_free_entry(struct p2m_domain *p2m,
>>>                              pte_t entry, unsigned int level)
>>>   {
>>> -    panic("%s: hasn't been implemented yet\n", __func__);
>>> +    unsigned int i;
>>> +    pte_t *table;
>>> +    mfn_t mfn;
>>> +    struct page_info *pg;
>>> +
>>> +    /* Nothing to do if the entry is invalid. */
>>> +    if ( !p2me_is_valid(p2m, entry) )
>>> +        return;
>> Does this actually apply to intermediate page tables (which you handle
>> later in the function), when that's (only) a P2M type check?
> 
> Yes, any PTE should have the V bit set to 1, so from the P2M perspective it
> should also, at least, not be equal to p2m_invalid.

I don't follow. Where would that type be set? The radix tree being GFN-
indexed, you would need to "invent" a GFN for every intermediate page table,
just to be able to (legitimately) invoke the type retrieval function. Maybe
you mean to leverage that (now, i.e. post-v2) you encode some of the types
directly in the PTE, and p2m_invalid may be one of them. But that wasn't
the case in the v2 submission, and hence the code looked wrong to me. Which
in turn suggests that at least some better commentary is going to be needed,
maybe even some BUILD_BUG_ON().
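
(For example - names invented here, to be adapted to your eventual encoding -
if the type is directly encoded in some number of PTE bits:

    BUILD_BUG_ON(p2m_invalid >= (1U << P2M_TYPE_PTE_BITS));

i.e. making sure p2m_invalid actually fits the in-PTE representation.)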

Jan
Re: [PATCH v2 12/17] xen/riscv: Implement p2m_free_entry() and related helpers
Posted by Oleksii Kurochko 3 months, 2 weeks ago
On 7/14/25 9:15 AM, Jan Beulich wrote:
> On 11.07.2025 17:56, Oleksii Kurochko wrote:
>> On 7/1/25 4:23 PM, Jan Beulich wrote:
>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>> +/*
>>>> + * In the case of the P2M, the valid bit is used for other purposes. Use
>>>> + * the type to check whether an entry is valid.
>>>> + */
>>>>    static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>    {
>>>> -    panic("%s: isn't implemented for now\n", __func__);
>>>> +    return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>> +}
>>> No checking of the valid bit?
>> As mentioned in the comment, only the P2M type should be checked, since the
>> valid bit is used for other purposes we discussed earlier, for example, to
>> track whether pages were accessed by a guest domain, or to support certain
>> table invalidation optimizations (1) and (2).
>> So, in this case, we only need to consider whether the entry is invalid
>> from the P2M perspective.
>>
>> (1) https://github.com/xen-project/xen/blob/19772b67/xen/arch/arm/mmu/p2m.c#L1245
>> (2) https://github.com/xen-project/xen/blob/19772b67/xen/arch/arm/mmu/p2m.c#L1386
> And there can be e.g. entries with the valid bit set and the type being
> p2m_invalid?

It shouldn't be so; at least at the moment I don't know of such cases.

> IOW there's no short-circuiting possible in any of the
> possible cases, avoiding the radix tree lookup in at least some of the
> cases?

Yes, I've implemented such an optimization. I started using two free bits
in the PTE for some “popular” types:
   static p2m_type_t p2m_get_type(struct p2m_domain *p2m, pte_t pte)
   {
       p2m_type_t type = MASK_EXTR(pte.pte, P2M_TYPE_PTE_BITS_MASK);
   
       if ( type == p2m_device_tree_type )
       {
	  ...
           ptr = radix_tree_lookup(&p2m->p2m_types, gfn_x(gfn));
           ...
           return radix_tree_ptr_to_int(ptr);
       }
   
       return type;
   }
   
   /*
    * In the case of the P2M, the valid bit is used for other purposes. Use
    * the type to check whether an entry is valid.
    */
   static inline bool p2m_is_valid(struct p2m_domain *p2m, pte_t pte)
   {
       return p2m_get_type(p2m, pte) != p2m_invalid;
   }


But thanks to your reply, I realized that in the case of p2m_is_valid(),
the implementation could be simplified further to:
   /*
    * In the case of the P2M, the valid bit is used for other purposes. Use
    * the type to check whether an entry is valid.
    */
   static inline bool p2m_is_valid(struct p2m_domain *p2m, pte_t pte)
   {
       return MASK_EXTR(pte.pte, P2M_TYPE_PTE_BITS_MASK) != p2m_invalid;
   }

As we care here only about whether the type is p2m_invalid or not,
and we don't need the specific type here if it's not p2m_invalid.

>
>>>> @@ -404,11 +426,127 @@ static int p2m_next_level(struct p2m_domain *p2m, bool alloc_tbl,
>>>>        return GUEST_TABLE_MAP_NONE;
>>>>    }
>>>>    
>>>> +static void p2m_put_foreign_page(struct page_info *pg)
>>>> +{
>>>> +    /*
>>>> +     * It's safe to do the put_page here because page_alloc will
>>>> +     * flush the TLBs if the page is reallocated before the end of
>>>> +     * this loop.
>>>> +     */
>>>> +    put_page(pg);
>>> Is the comment really true? The page allocator will flush the normal
>>> TLBs, but not the stage-2 ones. Yet those are what you care about here,
>>> aiui.
>> In alloc_heap_pages():
>>    ...
>>        if ( need_tlbflush )
>>           filtered_flush_tlb_mask(tlbflush_timestamp);
>>    ...
>>    
>> filtered_flush_tlb_mask() calls arch_flush_tlb_mask().
>>
>> and arch_flush_tlb_mask(), at least, on Arm (I haven't checked x86) is
>> implented as:
>>     void arch_flush_tlb_mask(const cpumask_t *mask)
>>     {
>>         /* No need to IPI other processors on ARM, the processor takes care of it. */
>>         flush_all_guests_tlb();
>>     }
>>
>> So it flushes stage-2 TLB.
> Hmm, okay. And I take it you have the same plan on RISC-V?

Yes, there is such a plan.

>   What I'd like to
> ask for, though, is that the comment (also) mentions where that (guest)
> flushing actually happens. That's not in page_alloc.c, and it also wasn't
> originally intended for guest TLBs to also be flushed from there (as x86 is
> where the flush avoidance machinery originates, which Arm and now also
> RISC-V don't really use).

Sure, it makes sense to update the comment.


>
>>>> +/* Put any references on the single 4K page referenced by mfn. */
>>>> +static void p2m_put_4k_page(mfn_t mfn, p2m_type_t type)
>>>> +{
>>>> +    /* TODO: Handle other p2m types */
>>>> +
>>>> +    /* Detect the xenheap page and mark the stored GFN as invalid. */
>>>> +    if ( p2m_is_ram(type) && is_xen_heap_mfn(mfn) )
>>>> +        page_set_xenheap_gfn(mfn_to_page(mfn), INVALID_GFN);
>>> Is this a valid thing to do? How do you make sure the respective uses
>>> (in gnttab's shared and status page arrays) are / were also removed?
>> Since the grant table frame GFN is stored directly in struct page_info instead
>> of being kept in standalone status/shared arrays, there is no need for the
>> status/shared arrays.
> I fear I don't follow. Looking at Arm's header (which I understand you
> derive from), I see
>
> #define gnttab_shared_page(t, i)   virt_to_page((t)->shared_raw[i])
>
> #define gnttab_status_page(t, i)   virt_to_page((t)->status[i])
>
> Are you intending to do things differently?

I missed these arrays... Arm had different arrays:
-    (gt)->arch.shared_gfn = xmalloc_array(gfn_t, ngf_);                  \
-    (gt)->arch.status_gfn = xmalloc_array(gfn_t, nsf_);                  \

I think I don't know the answer to your question, as I'm not deeply familiar
with grant tables and would need to do some additional investigation.

And just to be sure I understand your question correctly: are you asking
whether I marked a page as INVALID_GFN while a domain might still be using
it for grant table purposes?

>
>>>> +/* Put any references on the superpage referenced by mfn. */
>>>> +static void p2m_put_2m_superpage(mfn_t mfn, p2m_type_t type)
>>>> +{
>>>> +    struct page_info *pg;
>>>> +    unsigned int i;
>>>> +
>>>> +    ASSERT(mfn_valid(mfn));
>>>> +
>>>> +    pg = mfn_to_page(mfn);
>>>> +
>>>> +    for ( i = 0; i < XEN_PT_ENTRIES; i++, pg++ )
>>>> +        p2m_put_foreign_page(pg);
>>>> +}
>>>> +
>>>> +/* Put any references on the page referenced by pte. */
>>>> +static void p2m_put_page(struct p2m_domain *p2m, const pte_t pte,
>>>> +                         unsigned int level)
>>>> +{
>>>> +    mfn_t mfn = pte_get_mfn(pte);
>>>> +    p2m_type_t p2m_type = p2m_type_radix_get(p2m, pte);
>>> This gives you the type of the 1st page. What guarantees that all other pages
>>> in a superpage are of the exact same type?
>> Doesn't superpage mean that all the 4KB pages within that superpage have the
>> same type and are contiguous in memory?
> If the mapping is a super-page one - yes. Yet I see nothing super-page-ish
> here.

Probably, I just misunderstood your reply, but there is a check below:
     if ( level == 2 )
         return p2m_put_l2_superpage(mfn, pte.p2m.type);
And I expect that if level == 2, it means it is a superpage, which means that
all the 4KB pages within that superpage share the same type and are contiguous
in memory.


>
>>>> +    if ( level == 1 )
>>>> +        return p2m_put_2m_superpage(mfn, p2m_type);
>>>> +    else if ( level == 0 )
>>>> +        return p2m_put_4k_page(mfn, p2m_type);
>>> Use switch() right away?
>> It could be; I think there's no big difference at the moment, at least.
>> But I am okay with reworking it.
> If you don't want to use switch() here, then my other style nit would
> need giving: Please avoid "else" in situations like this.
>
>>>> +static void p2m_free_page(struct domain *d, struct page_info *pg)
>>>> +{
>>>> +    if ( is_hardware_domain(d) )
>>>> +        free_domheap_page(pg);
>>> Why's the hardware domain different here? It should have a pool just like
>>> all other domains have.
>> The hardware domain (Dom0) should have no limit on the number of pages that
>> can be allocated, so allocating p2m pages for the hardware domain is done
>> from the heap.
>>
>> The idea of the p2m pool is to provide a way to put a clear limit on the
>> amount of p2m allocation.
> Well, we had been there on another thread, and I outlined how I think
> Dom0 may want handling.

I'm afraid I don't remember. Could you please remind me which thread that was?
Perhaps you mean this reply: https://lore.kernel.org/xen-devel/cover.1749555949.git.oleksii.kurochko@gmail.com/T/#m4789842aaae1653b91d3368f66cadb0ef87fb17e ?
But this is not really about the Dom0 case.

>
>>>>    /* Free pte sub-tree behind an entry */
>>>>    static void p2m_free_entry(struct p2m_domain *p2m,
>>>>                               pte_t entry, unsigned int level)
>>>>    {
>>>> -    panic("%s: hasn't been implemented yet\n", __func__);
>>>> +    unsigned int i;
>>>> +    pte_t *table;
>>>> +    mfn_t mfn;
>>>> +    struct page_info *pg;
>>>> +
>>>> +    /* Nothing to do if the entry is invalid. */
>>>> +    if ( !p2me_is_valid(p2m, entry) )
>>>> +        return;
>>> Does this actually apply to intermediate page tables (which you handle
>>> later in the function), when that's (only) a P2M type check?
>> Yes, any PTE should have the V bit set to 1, so from the P2M perspective it
>> should also, at least, not be equal to p2m_invalid.
> I don't follow. Where would that type be set? The radix tree being GFN-
> indexed, you would need to "invent" a GFN for every intermediate page table,
> just to be able to (legitimately) invoke the type retrieval function.

Maybe it is incorrect, but in this patch series the type is set when
page_to_p2m_table() is called, which gets as an argument the page
corresponding to a table. The GFN is then calculated based on this MFN:
   static pte_t page_to_p2m_table(struct p2m_domain *p2m, struct page_info *page)
   {
       /*
        * Since this function generates a table entry, according to "Encoding
        * of PTE R/W/X fields," the entry's r, w, and x fields must be set to 0
        * to point to the next level of the page table.
        * Therefore, to ensure that an entry is a page table entry,
        * `p2m_access_n2rwx` is passed to `mfn_to_p2m_entry()` as the access
        * value, which overrides whatever was passed as `p2m_type_t` and
        * guarantees that the entry is a page table entry by setting
        * r = w = x = 0.
        */
       return p2m_entry_from_mfn(p2m, page_to_mfn(page), p2m_ram_rw,
                                 p2m_access_n2rwx);
   }
where:
   static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn, p2m_type_t t, p2m_access_t a)
   {
       ...
   
       pte_set_mfn(&e, mfn);
   
       BUG_ON(p2m_type_radix_set(p2m, e, t));
   
       return e;
   }
   
and where:
   static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
   {
       int rc;
       gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
   
       rc = radix_tree_insert(&p2m->p2m_types, gfn_x(gfn),
                              radix_tree_int_to_ptr(t));
       ....
   }

But as you mentioned below ...

>   Maybe
> you mean to leverage that (now, i.e. post-v2) you encode some of the types
> directly in the PTE, and p2m_invalid may be one of them. But that wasn't
> the case in the v2 submission, and hence the code looked wrong to me. Which
> in turn suggests that at least some better commentary is going to be needed,
> maybe even some BUILD_BUG_ON().

... the p2m_invalid type will be encoded directly in the PTE in the next patch
version.

~ Oleksii
Re: [PATCH v2 12/17] xen/riscv: Implement p2m_free_entry() and related helpers
Posted by Jan Beulich 3 months, 2 weeks ago
On 14.07.2025 18:01, Oleksii Kurochko wrote:
> On 7/14/25 9:15 AM, Jan Beulich wrote:
>> On 11.07.2025 17:56, Oleksii Kurochko wrote:
>>> On 7/1/25 4:23 PM, Jan Beulich wrote:
>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>> +/* Put any references on the single 4K page referenced by mfn. */
>>>>> +static void p2m_put_4k_page(mfn_t mfn, p2m_type_t type)
>>>>> +{
>>>>> +    /* TODO: Handle other p2m types */
>>>>> +
>>>>> +    /* Detect the xenheap page and mark the stored GFN as invalid. */
>>>>> +    if ( p2m_is_ram(type) && is_xen_heap_mfn(mfn) )
>>>>> +        page_set_xenheap_gfn(mfn_to_page(mfn), INVALID_GFN);
>>>> Is this a valid thing to do? How do you make sure the respective uses
>>>> (in gnttab's shared and status page arrays) are / were also removed?
>>> Since the grant table frame GFN is stored directly in struct page_info instead
>>> of being kept in standalone status/shared arrays, there is no need for the
>>> status/shared arrays.
>> I fear I don't follow. Looking at Arm's header (which I understand you
>> derive from), I see
>>
>> #define gnttab_shared_page(t, i)   virt_to_page((t)->shared_raw[i])
>>
>> #define gnttab_status_page(t, i)   virt_to_page((t)->status[i])
>>
>> Are you intending to do things differently?
> 
> I missed these arrays... Arm had different arrays:
> -    (gt)->arch.shared_gfn = xmalloc_array(gfn_t, ngf_);                  \
> -    (gt)->arch.status_gfn = xmalloc_array(gfn_t, nsf_);                  \
> 
> I think I don't know the answer to your question, as I'm not deeply familiar
> with grant tables and would need to do some additional investigation.
> 
> And just to be sure I understand your question correctly: are you asking
> whether I marked a page as INVALID_GFN while a domain might still be using
> it for grant table purposes?

Not quite. I'm trying to indicate that you may leave stale information around
when you update the struct page_info instance without also updating one of the
array slots. IOW I think both updates need to happen in sync, or it needs to
be explained why not doing so is still okay.

>>>>> +/* Put any references on the superpage referenced by mfn. */
>>>>> +static void p2m_put_2m_superpage(mfn_t mfn, p2m_type_t type)
>>>>> +{
>>>>> +    struct page_info *pg;
>>>>> +    unsigned int i;
>>>>> +
>>>>> +    ASSERT(mfn_valid(mfn));
>>>>> +
>>>>> +    pg = mfn_to_page(mfn);
>>>>> +
>>>>> +    for ( i = 0; i < XEN_PT_ENTRIES; i++, pg++ )
>>>>> +        p2m_put_foreign_page(pg);
>>>>> +}
>>>>> +
>>>>> +/* Put any references on the page referenced by pte. */
>>>>> +static void p2m_put_page(struct p2m_domain *p2m, const pte_t pte,
>>>>> +                         unsigned int level)
>>>>> +{
>>>>> +    mfn_t mfn = pte_get_mfn(pte);
>>>>> +    p2m_type_t p2m_type = p2m_type_radix_get(p2m, pte);
>>>> This gives you the type of the 1st page. What guarantees that all other pages
>>>> in a superpage are of the exact same type?
>>> Doesn't superpage mean that all the 4KB pages within that superpage have the
>>> same type and are contiguous in memory?
>> If the mapping is a super-page one - yes. Yet I see nothing super-page-ish
>> here.
> 
> Probably, I just misunderstood your reply, but there is a check below:
>      if ( level == 2 )
>          return p2m_put_l2_superpage(mfn, pte.p2m.type);
> And I expect that if level == 2, it means it is a superpage, which means that
> all the 4KB pages within that superpage share the same type and are contiguous
> in memory.

Let's hope that all of this is going to remain consistent then.

>>>>> +static void p2m_free_page(struct domain *d, struct page_info *pg)
>>>>> +{
>>>>> +    if ( is_hardware_domain(d) )
>>>>> +        free_domheap_page(pg);
>>>> Why's the hardware domain different here? It should have a pool just like
>>>> all other domains have.
>>> The hardware domain (Dom0) should have no limit on the number of pages that
>>> can be allocated, so allocating p2m pages for the hardware domain is done
>>> from the heap.
>>>
>>> The idea of the p2m pool is to provide a way to put a clear limit on the
>>> amount of p2m allocation.
>> Well, we had been there on another thread, and I outlined how I think
>> Dom0 may want handling.
> 
> I'm afraid I don't remember. Could you please remind me which thread that was?
> Perhaps you mean this reply: https://lore.kernel.org/xen-devel/cover.1749555949.git.oleksii.kurochko@gmail.com/T/#m4789842aaae1653b91d3368f66cadb0ef87fb17e ?
> But this is not really about the Dom0 case.

It would have been where the allocation counterpart to the freeing here is,
I expect.

Jan