[PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()

Oleksii Kurochko posted 17 patches 4 months, 3 weeks ago
There is a newer version of this series
[PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Oleksii Kurochko 4 months, 3 weeks ago
This patch introduces p2m_set_entry() and its core helper __p2m_set_entry() for
RISC-V, based loosely on the Arm implementation, with several RISC-V-specific
modifications.

Key differences include:
- TLB Flushing: RISC-V allows caching of invalid PTEs and does not require
  break-before-make (BBM). As a result, the flushing logic is simplified.
  TLB invalidation can be deferred until p2m_write_unlock() is called.
  Consequently, the p2m->need_flush flag is always considered true and is
  removed.
- Page Table Traversal: The order of walking the page tables differs from Arm,
  and this implementation reflects that reversed traversal.
- Macro Adjustments: The macros P2M_ROOT_LEVEL, P2M_ROOT_ORDER, and
  P2M_ROOT_PAGES are updated to align with the new RISC-V implementation.

The main functionality is in __p2m_set_entry(), which handles mappings aligned
to page table block entries (e.g., 1GB, 2MB, or 4KB with 4KB granularity).

p2m_set_entry() breaks a region down into block-aligned mappings and calls
__p2m_set_entry() accordingly.

Stub implementations (to be completed later) include:
- p2m_free_entry()
- p2m_next_level()
- p2m_entry_from_mfn()
- p2me_is_valid()

Note: Support for shattering block entries is not implemented in this patch
and will be added separately.

Additionally, some straightforward helper functions are now implemented:
- p2m_write_pte()
- p2m_remove_pte()
- p2m_get_root_pointer()

Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
Changes in V2:
 - New patch. It was a part of the big patch "xen/riscv: implement p2m mapping
   functionality" which was split into smaller ones.
 - Update the way the p2m TLB is flushed:
 - RISC-V doesn't require BBM, so there is no need to remove a PTE before
   making a new one; drop 'if /* pte_is_valid(orig_pte) */' and remove the
   PTE only when its removal has been requested.
 - Drop p2m->need_flush |= !!pte_is_valid(orig_pte); for the case of PTE
   removal, as RISC-V can cache invalid PTEs and therefore requires a flush
   each time, regardless of whether the PTE is valid at the moment of its
   removal.
 - Drop the check of whether the PTE is valid when a PTE is modified: as
   mentioned above, BBM isn't required, so TLB flushing can be deferred and
   there is no need to do it before modifying a PTE.
 - Drop p2m->need_flush as it seems like it will be always true.
 - Drop foreign mapping things as it isn't necessary for RISC-V right now.
 - s/p2m_is_valid/p2me_is_valid.
 - Move the definition and initialization of p2m->{max_mapped_gfn,lowest_mapped_gfn}
   to this patch.
---
 xen/arch/riscv/include/asm/p2m.h |  16 ++
 xen/arch/riscv/p2m.c             | 260 ++++++++++++++++++++++++++++++-
 2 files changed, 275 insertions(+), 1 deletion(-)

diff --git a/xen/arch/riscv/include/asm/p2m.h b/xen/arch/riscv/include/asm/p2m.h
index af2025b9fd..fdebd18356 100644
--- a/xen/arch/riscv/include/asm/p2m.h
+++ b/xen/arch/riscv/include/asm/p2m.h
@@ -9,8 +9,13 @@
 #include <xen/rwlock.h>
 #include <xen/types.h>
 
+#include <asm/page.h>
 #include <asm/page-bits.h>
 
+#define P2M_ROOT_LEVEL  HYP_PT_ROOT_LEVEL
+#define P2M_ROOT_ORDER  XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL)
+#define P2M_ROOT_PAGES  BIT(P2M_ROOT_ORDER, U)
+
 #define paddr_bits PADDR_BITS
 
 /* Get host p2m table */
@@ -49,6 +54,17 @@ struct p2m_domain {
 
     /* Current VMID in use */
     uint16_t vmid;
+
+    /* Highest guest frame that's ever been mapped in the p2m */
+    gfn_t max_mapped_gfn;
+
+    /*
+     * Lowest mapped gfn in the p2m. When releasing mapped gfn's in a
+     * preemptible manner this is updated to track where to resume
+     * the search. Apart from during teardown this can only
+     * decrease.
+     */
+    gfn_t lowest_mapped_gfn;
 };
 
 /*
diff --git a/xen/arch/riscv/p2m.c b/xen/arch/riscv/p2m.c
index cea37c8bda..27499a86bb 100644
--- a/xen/arch/riscv/p2m.c
+++ b/xen/arch/riscv/p2m.c
@@ -231,6 +231,8 @@ int p2m_init(struct domain *d)
     INIT_PAGE_LIST_HEAD(&p2m->pages);
 
     p2m->vmid = INVALID_VMID;
+    p2m->max_mapped_gfn = _gfn(0);
+    p2m->lowest_mapped_gfn = _gfn(ULONG_MAX);
 
     p2m->default_access = p2m_access_rwx;
 
@@ -325,6 +327,214 @@ int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
     return 0;
 }
 
+/*
+ * Find and map the root page table. The caller is responsible for
+ * unmapping the table.
+ *
+ * The function will return NULL if the offset of the root table is
+ * invalid.
+ */
+static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
+{
+    unsigned long root_table_indx;
+
+    root_table_indx = gfn_x(gfn) >> XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL);
+    if ( root_table_indx >= P2M_ROOT_PAGES )
+        return NULL;
+
+    return __map_domain_page(p2m->root + root_table_indx);
+}
+
+static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
+{
+    panic("%s: isn't implemented for now\n", __func__);
+
+    return false;
+}
+
+static inline void p2m_write_pte(pte_t *p, pte_t pte, bool clean_pte)
+{
+    write_pte(p, pte);
+    if ( clean_pte )
+        clean_dcache_va_range(p, sizeof(*p));
+}
+
+static inline void p2m_remove_pte(pte_t *p, bool clean_pte)
+{
+    pte_t pte;
+
+    memset(&pte, 0x00, sizeof(pte));
+    p2m_write_pte(p, pte, clean_pte);
+}
+
+static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn,
+                                p2m_type_t t, p2m_access_t a)
+{
+    panic("%s: hasn't been implemented yet\n", __func__);
+
+    return (pte_t) { .pte = 0 };
+}
+
+#define GUEST_TABLE_MAP_NONE 0
+#define GUEST_TABLE_MAP_NOMEM 1
+#define GUEST_TABLE_SUPER_PAGE 2
+#define GUEST_TABLE_NORMAL 3
+
+/*
+ * Take the currently mapped table, find the entry corresponding to the
+ * GFN, and map the next table, if available. The previous table will be
+ * unmapped if the next level was mapped (e.g. GUEST_TABLE_NORMAL
+ * returned).
+ *
+ * `alloc_tbl` parameter indicates whether intermediate tables should
+ * be allocated when not present.
+ *
+ * Return values:
+ *  GUEST_TABLE_MAP_NONE: a table allocation isn't permitted.
+ *  GUEST_TABLE_MAP_NOMEM: allocating a new page failed.
+ *  GUEST_TABLE_SUPER_PAGE: The next entry points to a superpage.
+ *  GUEST_TABLE_NORMAL: The next level or leaf entry was mapped normally.
+ */
+static int p2m_next_level(struct p2m_domain *p2m, bool alloc_tbl,
+                          unsigned int level, pte_t **table,
+                          unsigned int offset)
+{
+    panic("%s: hasn't been implemented yet\n", __func__);
+
+    return GUEST_TABLE_MAP_NONE;
+}
+
+/* Free pte sub-tree behind an entry */
+static void p2m_free_entry(struct p2m_domain *p2m,
+                           pte_t entry, unsigned int level)
+{
+    panic("%s: hasn't been implemented yet\n", __func__);
+}
+
+/*
+ * Insert an entry in the p2m. This should be called with a mapping
+ * equal to a page/superpage.
+ */
+static int __p2m_set_entry(struct p2m_domain *p2m,
+                           gfn_t sgfn,
+                           unsigned int page_order,
+                           mfn_t smfn,
+                           p2m_type_t t,
+                           p2m_access_t a)
+{
+    unsigned int level;
+    unsigned int target = page_order / PAGETABLE_ORDER;
+    pte_t *entry, *table, orig_pte;
+    int rc;
+    /* A mapping is removed if the MFN is invalid. */
+    bool removing_mapping = mfn_eq(smfn, INVALID_MFN);
+    DECLARE_OFFSETS(offsets, gfn_to_gaddr(sgfn));
+
+    ASSERT(p2m_is_write_locked(p2m));
+
+    /*
+     * Check if the level target is valid: we only support
+     * 4K - 2M - 1G mapping.
+     */
+    ASSERT(target <= 2);
+
+    table = p2m_get_root_pointer(p2m, sgfn);
+    if ( !table )
+        return -EINVAL;
+
+    for ( level = P2M_ROOT_LEVEL; level > target; level-- )
+    {
+        /*
+         * Don't try to allocate intermediate page table if the mapping
+         * is about to be removed.
+         */
+        rc = p2m_next_level(p2m, !removing_mapping,
+                            level, &table, offsets[level]);
+        if ( (rc == GUEST_TABLE_MAP_NONE) || (rc == GUEST_TABLE_MAP_NOMEM) )
+        {
+            /*
+             * We are here because p2m_next_level has failed to map
+             * the intermediate page table (e.g. the table does not exist
+             * and the p2m tree is read-only). It is a valid case
+             * when removing a mapping as it may not exist in the
+             * page table. In this case, just ignore it.
+             */
+            rc = removing_mapping ?  0 : -ENOENT;
+            goto out;
+        }
+        else if ( rc != GUEST_TABLE_NORMAL )
+            break;
+    }
+
+    entry = table + offsets[level];
+
+    /*
+     * If we are here with level > target, we must be at a leaf node,
+     * and we need to break up the superpage.
+     */
+    if ( level > target )
+    {
+        panic("Shattering isn't implemented\n");
+    }
+
+    /*
+     * We should always be there with the correct level because
+     * all the intermediate tables have been installed if necessary.
+     */
+    ASSERT(level == target);
+
+    orig_pte = *entry;
+
+    /*
+     * The access type should always be p2m_access_rwx when the mapping
+     * is removed.
+     */
+    ASSERT(!mfn_eq(INVALID_MFN, smfn) || (a == p2m_access_rwx));
+
+    if ( removing_mapping )
+        p2m_remove_pte(entry, p2m->clean_pte);
+    else {
+        pte_t pte = p2m_entry_from_mfn(p2m, smfn, t, a);
+
+        p2m_write_pte(entry, pte, p2m->clean_pte);
+
+        p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn,
+                                      gfn_add(sgfn, (1UL << page_order) - 1));
+        p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn);
+    }
+
+#ifdef CONFIG_HAS_PASSTHROUGH
+    if ( is_iommu_enabled(p2m->domain) &&
+         (pte_is_valid(orig_pte) || pte_is_valid(*entry)) )
+    {
+        unsigned int flush_flags = 0;
+
+        if ( pte_is_valid(orig_pte) )
+            flush_flags |= IOMMU_FLUSHF_modified;
+        if ( pte_is_valid(*entry) )
+            flush_flags |= IOMMU_FLUSHF_added;
+
+        rc = iommu_iotlb_flush(p2m->domain, _dfn(gfn_x(sgfn)),
+                               1UL << page_order, flush_flags);
+    }
+    else
+#endif
+        rc = 0;
+
+    /*
+     * Free the entry only if the original pte was valid and the base
+     * is different (to avoid freeing when permission is changed).
+     */
+    if ( p2me_is_valid(p2m, orig_pte) &&
+         !mfn_eq(pte_get_mfn(*entry), pte_get_mfn(orig_pte)) )
+        p2m_free_entry(p2m, orig_pte, level);
+
+out:
+    unmap_domain_page(table);
+
+    return rc;
+}
+
 static int p2m_set_entry(struct p2m_domain *p2m,
                          gfn_t sgfn,
                          unsigned long nr,
@@ -332,7 +542,55 @@ static int p2m_set_entry(struct p2m_domain *p2m,
                          p2m_type_t t,
                          p2m_access_t a)
 {
-    return -EOPNOTSUPP;
+    int rc = 0;
+
+    /*
+     * Any reference taken by the P2M mappings (e.g. foreign mapping) will
+     * be dropped in relinquish_p2m_mapping(). As the P2M will still
+     * be accessible after, we need to prevent mapping to be added when the
+     * domain is dying.
+     */
+    if ( unlikely(p2m->domain->is_dying) )
+        return -ENOMEM;
+
+    while ( nr )
+    {
+        unsigned long mask;
+        unsigned long order = 0;
+        /* 1gb, 2mb, 4k mappings are supported */
+        unsigned int i = ( P2M_ROOT_LEVEL > 2 ) ? 2 : P2M_ROOT_LEVEL;
+
+        /*
+         * Don't take into account the MFN when removing mapping (i.e
+         * MFN_INVALID) to calculate the correct target order.
+         *
+         * XXX: Support superpage mappings if nr is not aligned to a
+         * superpage size.
+         */
+        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
+        mask |= gfn_x(sgfn) | nr;
+
+        for ( ; i != 0; i-- )
+        {
+            if ( !(mask & (BIT(XEN_PT_LEVEL_ORDER(i), UL) - 1)) )
+            {
+                    order = XEN_PT_LEVEL_ORDER(i);
+                    break;
+            }
+        }
+
+        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
+        if ( rc )
+            break;
+
+        sgfn = gfn_add(sgfn, (1 << order));
+        if ( !mfn_eq(smfn, INVALID_MFN) )
+           smfn = mfn_add(smfn, (1 << order));
+
+        nr -= (1 << order);
+    }
+
+    return rc;
 }
 
 static int p2m_insert_mapping(struct domain *d, gfn_t start_gfn,
-- 
2.49.0
Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Jan Beulich 4 months ago
On 10.06.2025 15:05, Oleksii Kurochko wrote:
> This patch introduces p2m_set_entry() and its core helper __p2m_set_entry() for
> RISC-V, based loosely on the Arm implementation, with several RISC-V-specific
> modifications.
> 
> Key differences include:
> - TLB Flushing: RISC-V allows caching of invalid PTEs and does not require
>   break-before-make (BBM). As a result, the flushing logic is simplified.
>   TLB invalidation can be deferred until p2m_write_unlock() is called.
>   Consequently, the p2m->need_flush flag is always considered true and is
>   removed.
> - Page Table Traversal: The order of walking the page tables differs from Arm,
>   and this implementation reflects that reversed traversal.
> - Macro Adjustments: The macros P2M_ROOT_LEVEL, P2M_ROOT_ORDER, and
>   P2M_ROOT_PAGES are updated to align with the new RISC-V implementation.
> 
> The main functionality is in __p2m_set_entry(), which handles mappings aligned
> to page table block entries (e.g., 1GB, 2MB, or 4KB with 4KB granularity).
> 
> p2m_set_entry() breaks a region down into block-aligned mappings and calls
> __p2m_set_entry() accordingly.
> 
> Stub implementations (to be completed later) include:
> - p2m_free_entry()

What would a function of this name do? You can clear entries, but you can't
free them, can you?

> --- a/xen/arch/riscv/include/asm/p2m.h
> +++ b/xen/arch/riscv/include/asm/p2m.h
> @@ -9,8 +9,13 @@
>  #include <xen/rwlock.h>
>  #include <xen/types.h>
>  
> +#include <asm/page.h>
>  #include <asm/page-bits.h>
>  
> +#define P2M_ROOT_LEVEL  HYP_PT_ROOT_LEVEL
> +#define P2M_ROOT_ORDER  XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL)

This is confusing, as in patch 6 we see that p2m root table order is 2.
Something needs doing about the naming, so the two sets of things can't
be confused.

> @@ -49,6 +54,17 @@ struct p2m_domain {
>  
>      /* Current VMID in use */
>      uint16_t vmid;
> +
> +    /* Highest guest frame that's ever been mapped in the p2m */
> +    gfn_t max_mapped_gfn;
> +
> +    /*
> +     * Lowest mapped gfn in the p2m. When releasing mapped gfn's in a
> +     * preemptible manner this is update to track recall where to
> +     * resume the search. Apart from during teardown this can only
> +     * decrease.
> +     */
> +    gfn_t lowest_mapped_gfn;

When you copied the comment, you surely read it. Yet you copied pretty
obvious flaws as-is. That is s/update/updated/, and something wants
doing about "track recall", which makes no sense to me.

> --- a/xen/arch/riscv/p2m.c
> +++ b/xen/arch/riscv/p2m.c
> @@ -231,6 +231,8 @@ int p2m_init(struct domain *d)
>      INIT_PAGE_LIST_HEAD(&p2m->pages);
>  
>      p2m->vmid = INVALID_VMID;
> +    p2m->max_mapped_gfn = _gfn(0);
> +    p2m->lowest_mapped_gfn = _gfn(ULONG_MAX);
>  
>      p2m->default_access = p2m_access_rwx;
>  
> @@ -325,6 +327,214 @@ int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
>      return 0;
>  }
>  
> +/*
> + * Find and map the root page table. The caller is responsible for
> + * unmapping the table.
> + *
> + * The function will return NULL if the offset of the root table is
> + * invalid.

Don't you mean "offset into ..."?

> + */
> +static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
> +{
> +    unsigned long root_table_indx;
> +
> +    root_table_indx = gfn_x(gfn) >> XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL);
> +    if ( root_table_indx >= P2M_ROOT_PAGES )
> +        return NULL;
> +
> +    return __map_domain_page(p2m->root + root_table_indx);
> +}
> +
> +static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)

The rule of thumb is to have inline functions only in header files, leaving
decisions to the compiler elsewhere.

> +{
> +    panic("%s: isn't implemented for now\n", __func__);
> +
> +    return false;
> +}

For this function in particular, though: Besides the "p2me" in the name
being somewhat odd (supposedly page table entries here are simply pte_t),
how is this going to be different from pte_is_valid()?

> +static inline void p2m_write_pte(pte_t *p, pte_t pte, bool clean_pte)
> +{
> +    write_pte(p, pte);
> +    if ( clean_pte )
> +        clean_dcache_va_range(p, sizeof(*p));
> +}
> +
> +static inline void p2m_remove_pte(pte_t *p, bool clean_pte)
> +{
> +    pte_t pte;
> +
> +    memset(&pte, 0x00, sizeof(pte));
> +    p2m_write_pte(p, pte, clean_pte);
> +}

May I suggest "clear" instead of "remove" and plain 0 instead of 0x00
(or simply give the variable a trivial initializer)?

As to the earlier function that I commented on: Seeing the names here,
wouldn't p2m_pte_is_valid() be a more consistent name there?

> +static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn,
> +                                p2m_type_t t, p2m_access_t a)
> +{
> +    panic("%s: hasn't been implemented yet\n", __func__);
> +
> +    return (pte_t) { .pte = 0 };
> +}

And then perhaps p2m_pte_from_mfn() here?

> +#define GUEST_TABLE_MAP_NONE 0
> +#define GUEST_TABLE_MAP_NOMEM 1
> +#define GUEST_TABLE_SUPER_PAGE 2
> +#define GUEST_TABLE_NORMAL 3

Is GUEST_ a good prefix? The guest doesn't control these tables, and the
word could also mean the guest's own page tables.

> +/*
> + * Take the currently mapped table, find the corresponding GFN entry,

That's not what you mean though, is it? It's more like "the entry
corresponding to the GFN" (implying "at the given level").

> + * and map the next table, if available. The previous table will be
> + * unmapped if the next level was mapped (e.g GUEST_TABLE_NORMAL
> + * returned).
> + *
> + * `alloc_tbl` parameter indicates whether intermediate tables should
> + * be allocated when not present.
> + *
> + * Return values:
> + *  GUEST_TABLE_MAP_NONE: a table allocation isn't permitted.
> + *  GUEST_TABLE_MAP_NOMEM: allocating a new page failed.
> + *  GUEST_TABLE_SUPER_PAGE: next level or leaf mapped normally.
> + *  GUEST_TABLE_NORMAL: The next entry points to a superpage.
> + */
> +static int p2m_next_level(struct p2m_domain *p2m, bool alloc_tbl,
> +                          unsigned int level, pte_t **table,
> +                          unsigned int offset)
> +{
> +    panic("%s: hasn't been implemented yet\n", __func__);
> +
> +    return GUEST_TABLE_MAP_NONE;
> +}
> +
> +/* Free pte sub-tree behind an entry */
> +static void p2m_free_entry(struct p2m_domain *p2m,
> +                           pte_t entry, unsigned int level)
> +{
> +    panic("%s: hasn't been implemented yet\n", __func__);
> +}
> +
> +/*
> + * Insert an entry in the p2m. This should be called with a mapping
> + * equal to a page/superpage.
> + */
> +static int __p2m_set_entry(struct p2m_domain *p2m,

No double leading underscores, please. A single one is fine and will do.

> +                           gfn_t sgfn,
> +                           unsigned int page_order,
> +                           mfn_t smfn,

What are the "s" in "sgfn" and "smfn" indicating? Possibly "start", except
that you don't process multiple GFNs here (unlike in the caller).

> +                           p2m_type_t t,
> +                           p2m_access_t a)
> +{
> +    unsigned int level;
> +    unsigned int target = page_order / PAGETABLE_ORDER;
> +    pte_t *entry, *table, orig_pte;
> +    int rc;
> +    /* A mapping is removed if the MFN is invalid. */
> +    bool removing_mapping = mfn_eq(smfn, INVALID_MFN);
> +    DECLARE_OFFSETS(offsets, gfn_to_gaddr(sgfn));
> +
> +    ASSERT(p2m_is_write_locked(p2m));
> +
> +    /*
> +     * Check if the level target is valid: we only support
> +     * 4K - 2M - 1G mapping.
> +     */
> +    ASSERT(target <= 2);

No provisions towards the division that produced the value having left
a remainder?

> +    table = p2m_get_root_pointer(p2m, sgfn);
> +    if ( !table )
> +        return -EINVAL;
> +
> +    for ( level = P2M_ROOT_LEVEL; level > target; level-- )
> +    {
> +        /*
> +         * Don't try to allocate intermediate page table if the mapping
> +         * is about to be removed.
> +         */
> +        rc = p2m_next_level(p2m, !removing_mapping,
> +                            level, &table, offsets[level]);
> +        if ( (rc == GUEST_TABLE_MAP_NONE) || (rc == GUEST_TABLE_MAP_NOMEM) )
> +        {
> +            /*
> +             * We are here because p2m_next_level has failed to map
> +             * the intermediate page table (e.g the table does not exist
> +             * and they p2m tree is read-only). It is a valid case
> +             * when removing a mapping as it may not exist in the
> +             * page table. In this case, just ignore it.
> +             */
> +            rc = removing_mapping ?  0 : -ENOENT;

Shouldn't GUEST_TABLE_MAP_NOMEM be transformed to -ENOMEM?

> +            goto out;
> +        }
> +        else if ( rc != GUEST_TABLE_NORMAL )

No need for "else" here.

> +            break;
> +    }
> +
> +    entry = table + offsets[level];
> +
> +    /*
> +     * If we are here with level > target, we must be at a leaf node,
> +     * and we need to break up the superpage.
> +     */
> +    if ( level > target )
> +    {
> +        panic("Shattering isn't implemented\n");
> +    }
> +
> +    /*
> +     * We should always be there with the correct level because
> +     * all the intermediate tables have been installed if necessary.
> +     */
> +    ASSERT(level == target);
> +
> +    orig_pte = *entry;
> +
> +    /*
> +     * The access type should always be p2m_access_rwx when the mapping
> +     * is removed.
> +     */
> +    ASSERT(!mfn_eq(INVALID_MFN, smfn) || (a == p2m_access_rwx));
> +
> +    if ( removing_mapping )
> +        p2m_remove_pte(entry, p2m->clean_pte);
> +    else {

Nit: Style.

> +        pte_t pte = p2m_entry_from_mfn(p2m, smfn, t, a);
> +
> +        p2m_write_pte(entry, pte, p2m->clean_pte);
> +
> +        p2m->max_mapped_gfn = gfn_max(p2m->max_mapped_gfn,
> +                                      gfn_add(sgfn, (1UL << page_order) - 1));
> +        p2m->lowest_mapped_gfn = gfn_min(p2m->lowest_mapped_gfn, sgfn);
> +    }
> +
> +#ifdef CONFIG_HAS_PASSTHROUGH

See my earlier comment regarding this kind of #ifdef.

> @@ -332,7 +542,55 @@ static int p2m_set_entry(struct p2m_domain *p2m,
>                           p2m_type_t t,
>                           p2m_access_t a)
>  {
> -    return -EOPNOTSUPP;
> +    int rc = 0;
> +
> +    /*
> +     * Any reference taken by the P2M mappings (e.g. foreign mapping) will
> +     * be dropped in relinquish_p2m_mapping(). As the P2M will still
> +     * be accessible after, we need to prevent mapping to be added when the
> +     * domain is dying.
> +     */
> +    if ( unlikely(p2m->domain->is_dying) )
> +        return -ENOMEM;

Why ENOMEM?

> +    while ( nr )

Why's there a loop here? The function name uses singular, i.e. means to
create exactly one entry.

> +    {
> +        unsigned long mask;
> +        unsigned long order = 0;

unsigned int?

> +        /* 1gb, 2mb, 4k mappings are supported */
> +        unsigned int i = ( P2M_ROOT_LEVEL > 2 ) ? 2 : P2M_ROOT_LEVEL;

Not (style): Excess blanks. Yet then aren't you open-coding min() here
anyway? Plus isn't P2M_ROOT_LEVEL always >= 2?

> +        /*
> +         * Don't take into account the MFN when removing mapping (i.e
> +         * MFN_INVALID) to calculate the correct target order.
> +         *
> +         * XXX: Support superpage mappings if nr is not aligned to a
> +         * superpage size.
> +         */

Does this really need leaving as a to-do?

> +        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
> +        mask |= gfn_x(sgfn) | nr;
> +
> +        for ( ; i != 0; i-- )
> +        {
> +            if ( !(mask & (BIT(XEN_PT_LEVEL_ORDER(i), UL) - 1)) )
> +            {
> +                    order = XEN_PT_LEVEL_ORDER(i);
> +                    break;

Nit: Style.

> +            }
> +        }
> +
> +        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
> +        if ( rc )
> +            break;
> +
> +        sgfn = gfn_add(sgfn, (1 << order));
> +        if ( !mfn_eq(smfn, INVALID_MFN) )
> +           smfn = mfn_add(smfn, (1 << order));
> +
> +        nr -= (1 << order);

Throughout maybe better be safe right away and use 1UL?

> +    }
> +
> +    return rc;
>  }

How's the caller going to know how much of the range was successfully
mapped? That part may need undoing (if not here, then in the caller),
or a caller may want to retry.

Jan
Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Oleksii Kurochko 3 months, 4 weeks ago
On 7/1/25 3:49 PM, Jan Beulich wrote:
> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>> This patch introduces p2m_set_entry() and its core helper __p2m_set_entry() for
>> RISC-V, based loosely on the Arm implementation, with several RISC-V-specific
>> modifications.
>>
>> Key differences include:
>> - TLB Flushing: RISC-V allows caching of invalid PTEs and does not require
>>    break-before-make (BBM). As a result, the flushing logic is simplified.
>>    TLB invalidation can be deferred until p2m_write_unlock() is called.
>>    Consequently, the p2m->need_flush flag is always considered true and is
>>    removed.
>> - Page Table Traversal: The order of walking the page tables differs from Arm,
>>    and this implementation reflects that reversed traversal.
>> - Macro Adjustments: The macros P2M_ROOT_LEVEL, P2M_ROOT_ORDER, and
>>    P2M_ROOT_PAGES are updated to align with the new RISC-V implementation.
>>
>> The main functionality is in __p2m_set_entry(), which handles mappings aligned
>> to page table block entries (e.g., 1GB, 2MB, or 4KB with 4KB granularity).
>>
>> p2m_set_entry() breaks a region down into block-aligned mappings and calls
>> __p2m_set_entry() accordingly.
>>
>> Stub implementations (to be completed later) include:
>> - p2m_free_entry()
> What would a function of this name do?

Recursively visiting all leaf PTE's for sub-tree behind an entry, then calls
put_page() (which will free if there is no any reference to this page),
freeing intermediate page table (after all entries were freed) by removing
it from d->arch.paging.freelist, and removes correspondent page of intermediate page
table from p2m->pages list.

> You can clear entries, but you can't
> free them, can you?

Is it a question regarding terminology? I can't free the entry itself, but the page table or
the page (if it is a leaf entry) it points to can be freed.

>> --- a/xen/arch/riscv/include/asm/p2m.h
>> +++ b/xen/arch/riscv/include/asm/p2m.h
>> @@ -9,8 +9,13 @@
>>   #include <xen/rwlock.h>
>>   #include <xen/types.h>
>>   
>> +#include <asm/page.h>
>>   #include <asm/page-bits.h>
>>   
>> +#define P2M_ROOT_LEVEL  HYP_PT_ROOT_LEVEL
>> +#define P2M_ROOT_ORDER  XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL)
> This is confusing, as in patch 6 we see that p2m root table order is 2.
> Something needs doing about the naming, so the two sets of things can't
> be confused.

Agree, confusing enough.

I will define P2M_ROOT_ORDER as get_order_from_bytes(GUEST_ROOT_PAGE_TABLE_SIZE)
(or declare a new variable to store this value).

Actually, the way it's currently defined was only needed for p2m_get_root_pointer() to find the root page table by GFN, but XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL) is used explicitly there, so I just missed doing a proper cleanup.


>> --- a/xen/arch/riscv/p2m.c
>> +++ b/xen/arch/riscv/p2m.c
>> @@ -231,6 +231,8 @@ int p2m_init(struct domain *d)
>>       INIT_PAGE_LIST_HEAD(&p2m->pages);
>>   
>>       p2m->vmid = INVALID_VMID;
>> +    p2m->max_mapped_gfn = _gfn(0);
>> +    p2m->lowest_mapped_gfn = _gfn(ULONG_MAX);
>>   
>>       p2m->default_access = p2m_access_rwx;
>>   
>> @@ -325,6 +327,214 @@ int p2m_set_allocation(struct domain *d, unsigned long pages, bool *preempted)
>>       return 0;
>>   }
>>   
>> +/*
>> + * Find and map the root page table. The caller is responsible for
>> + * unmapping the table.
>> + *
>> + * The function will return NULL if the offset of the root table is
>> + * invalid.
> Don't you mean "offset into ..."?

If you hadn't suggested that, I would have thought that the meanings of "of" and "into" are pretty close.
But it seems that semantically "into" is more accurate and better conveys the intent of the code.

>> + */
>> +static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>> +{
>> +    unsigned long root_table_indx;
>> +
>> +    root_table_indx = gfn_x(gfn) >> XEN_PT_LEVEL_ORDER(P2M_ROOT_LEVEL);
>> +    if ( root_table_indx >= P2M_ROOT_PAGES )
>> +        return NULL;
>> +
>> +    return __map_domain_page(p2m->root + root_table_indx);
>> +}
>> +
>> +static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
> The rule of thumb is to have inline functions only in header files, leaving
> decisions to the compiler elsewhere.

I am not sure what you mean in the second part (after coma) of your sentence.

>> +{
>> +    panic("%s: isn't implemented for now\n", __func__);
>> +
>> +    return false;
>> +}
> For this function in particular, though: Besides the "p2me" in the name
> being somewhat odd (supposedly page table entries here are simply pte_t),
> how is this going to be different from pte_is_valid()?

pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
what is a type stored in the radix tree (p2m->p2m_types):
   /*
    * In the case of the P2M, the valid bit is used for other purpose. Use
    * the type to check whether an entry is valid.
    */
   static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
   {
       return p2m_type_radix_get(p2m, pte) != p2m_invalid;
   }

It is done to track which page was modified by a guest.


>> +static inline void p2m_write_pte(pte_t *p, pte_t pte, bool clean_pte)
>> +{
>> +    write_pte(p, pte);
>> +    if ( clean_pte )
>> +        clean_dcache_va_range(p, sizeof(*p));
>> +}
>> +
>> +static inline void p2m_remove_pte(pte_t *p, bool clean_pte)
>> +{
>> +    pte_t pte;
>> +
>> +    memset(&pte, 0x00, sizeof(pte));
>> +    p2m_write_pte(p, pte, clean_pte);
>> +}
> May I suggest "clear" instead of "remove" and plain 0 instead of 0x00
> (or simply give the variable a trivial initializer)?

Sure, I will rename and use plain 0.

>
> As to the earlier function that I commented on: Seeing the names here,
> wouldn't p2m_pte_is_valid() be a more consistent name there?

Then all p2me_*() should be updated to p2m_pte_*().

But initial logic was that p2me = p2m entry = p2m page table entry.

Probably we can just return back to the prefix p2m_ as based on arguments
it is clear that it is a function for working with P2M's PTE.

>
>> +static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn,
>> +                                p2m_type_t t, p2m_access_t a)
>> +{
>> +    panic("%s: hasn't been implemented yet\n", __func__);
>> +
>> +    return (pte_t) { .pte = 0 };
>> +}
> And then perhaps p2m_pte_from_mfn() here?
>
>> +#define GUEST_TABLE_MAP_NONE 0
>> +#define GUEST_TABLE_MAP_NOMEM 1
>> +#define GUEST_TABLE_SUPER_PAGE 2
>> +#define GUEST_TABLE_NORMAL 3
> Is GUEST_ a good prefix? The guest doesn't control these tables, and the
> word could also mean the guest's own page tables.

Then P2M_ prefix should be better.

>
>> +/*
>> + * Take the currently mapped table, find the corresponding GFN entry,
> That's not what you mean though, is it? It's more like "the entry
> corresponding to the GFN" (implying "at the given level").

It will be more clear, I'll update the comment.

>
>> + * and map the next table, if available. The previous table will be
>> + * unmapped if the next level was mapped (e.g GUEST_TABLE_NORMAL
>> + * returned).
>> + *
>> + * `alloc_tbl` parameter indicates whether intermediate tables should
>> + * be allocated when not present.
>> + *
>> + * Return values:
>> + *  GUEST_TABLE_MAP_NONE: a table allocation isn't permitted.
>> + *  GUEST_TABLE_MAP_NOMEM: allocating a new page failed.
>> + *  GUEST_TABLE_SUPER_PAGE: next level or leaf mapped normally.
>> + *  GUEST_TABLE_NORMAL: The next entry points to a superpage.
>> + */
>> +static int p2m_next_level(struct p2m_domain *p2m, bool alloc_tbl,
>> +                          unsigned int level, pte_t **table,
>> +                          unsigned int offset)
>> +{
>> +    panic("%s: hasn't been implemented yet\n", __func__);
>> +
>> +    return GUEST_TABLE_MAP_NONE;
>> +}
>> +
>> +/* Free pte sub-tree behind an entry */
>> +static void p2m_free_entry(struct p2m_domain *p2m,
>> +                           pte_t entry, unsigned int level)
>> +{
>> +    panic("%s: hasn't been implemented yet\n", __func__);
>> +}
>> +
>> +/*
>> + * Insert an entry in the p2m. This should be called with a mapping
>> + * equal to a page/superpage.
>> + */
>> +static int __p2m_set_entry(struct p2m_domain *p2m,
> No double leading underscores, please. A single one is fine and will do.
>
>> +                           gfn_t sgfn,
>> +                           unsigned int page_order,
>> +                           mfn_t smfn,
> What are the "s" in "sgfn" and "smfn" indicating? Possibly "start", except
> that you don't process multiple GFNs here (unlike in the caller).

Yes, it stands for "start". I agree that it is not really necessary for __p2m_set_entry()
to use the "s" prefix. I'll rename them for __p2m_set_entry().

>
>> +                           p2m_type_t t,
>> +                           p2m_access_t a)
>> +{
>> +    unsigned int level;
>> +    unsigned int target = page_order / PAGETABLE_ORDER;
>> +    pte_t *entry, *table, orig_pte;
>> +    int rc;
>> +    /* A mapping is removed if the MFN is invalid. */
>> +    bool removing_mapping = mfn_eq(smfn, INVALID_MFN);
>> +    DECLARE_OFFSETS(offsets, gfn_to_gaddr(sgfn));
>> +
>> +    ASSERT(p2m_is_write_locked(p2m));
>> +
>> +    /*
>> +     * Check if the level target is valid: we only support
>> +     * 4K - 2M - 1G mapping.
>> +     */
>> +    ASSERT(target <= 2);
> No provisions towards the division that produced the value having left
> a remainder?

The way the order is initialized will always result in division without
a remainder.

If it makes sense, the ASSERT() could be updated to ensure that the order
is always a multiple of PAGETABLE_ORDER:
   ASSERT((target <= 2) && !IS_ALIGNED(page_order, PAGETABLE_ORDER));

>
>> +    table = p2m_get_root_pointer(p2m, sgfn);
>> +    if ( !table )
>> +        return -EINVAL;
>> +
>> +    for ( level = P2M_ROOT_LEVEL; level > target; level-- )
>> +    {
>> +        /*
>> +         * Don't try to allocate intermediate page table if the mapping
>> +         * is about to be removed.
>> +         */
>> +        rc = p2m_next_level(p2m, !removing_mapping,
>> +                            level, &table, offsets[level]);
>> +        if ( (rc == GUEST_TABLE_MAP_NONE) || (rc == GUEST_TABLE_MAP_NOMEM) )
>> +        {
>> +            /*
>> +             * We are here because p2m_next_level has failed to map
>> +             * the intermediate page table (e.g the table does not exist
>> +             * and they p2m tree is read-only). It is a valid case
>> +             * when removing a mapping as it may not exist in the
>> +             * page table. In this case, just ignore it.
>> +             */
>> +            rc = removing_mapping ?  0 : -ENOENT;
> Shouldn't GUEST_TABLE_MAP_NOMEM be transformed to -ENOMEM?

Maybe, but I think that it is not really necessary to be so precise here. -ENOENT
could cover both GUEST_TABLE_MAP_NONE and GUEST_TABLE_MAP_NOMEM.
Anyway, for consistency I will change this code to:
             rc = (rc == P2M_TABLE_MAP_NONE) ? -ENOENT : -ENOMEM;
             /*
              * We are here because p2m_next_level has failed to map
              * the intermediate page table (e.g the table does not exist
             * and the p2m tree is read-only). It is a valid case
              * when removing a mapping as it may not exist in the
              * page table. In this case, just ignore it.
              */
             rc = removing_mapping ?  0 : rc;
             goto out;

>> @@ -332,7 +542,55 @@ static int p2m_set_entry(struct p2m_domain *p2m,
>>                            p2m_type_t t,
>>                            p2m_access_t a)
>>   {
>> -    return -EOPNOTSUPP;
>> +    int rc = 0;
>> +
>> +    /*
>> +     * Any reference taken by the P2M mappings (e.g. foreign mapping) will
>> +     * be dropped in relinquish_p2m_mapping(). As the P2M will still
>> +     * be accessible after, we need to prevent mapping to be added when the
>> +     * domain is dying.
>> +     */
>> +    if ( unlikely(p2m->domain->is_dying) )
>> +        return -ENOMEM;
> Why ENOMEM?

I expect that when a domain is dying, it means there’s no point in using its
memory—either because it's no longer available or it has already been freed.
Basically, no memory.

>
>> +    while ( nr )
> Why's there a loop here? The function name uses singular, i.e. means to
> create exactly one entry.

I will rename the function to  p2m_set_entries().

>
>> +    {
>> +        unsigned long mask;
>> +        unsigned long order = 0;
> unsigned int?
>
>> +        /* 1gb, 2mb, 4k mappings are supported */
>> +        unsigned int i = ( P2M_ROOT_LEVEL > 2 ) ? 2 : P2M_ROOT_LEVEL;
> Not (style): Excess blanks. Yet then aren't you open-coding min() here
> anyway?

Yes, it is open-coded version of min(). I will use min() instead.

> Plus isn't P2M_ROOT_LEVEL always >= 2?

For Sv32, P2M_ROOT_LEVEL is 1; for other modes it is really always >= 2.


>
>> +        /*
>> +         * Don't take into account the MFN when removing mapping (i.e
>> +         * MFN_INVALID) to calculate the correct target order.
>> +         *
>> +         * XXX: Support superpage mappings if nr is not aligned to a
>> +         * superpage size.
>> +         */
> Does this really need leaving as a to-do?

I think so, yes. It won’t break the current workflow if `nr` isn’t aligned;
a smaller order will simply be chosen.

>
>> +        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
>> +        mask |= gfn_x(sgfn) | nr;
>> +
>> +        for ( ; i != 0; i-- )
>> +        {
>> +            if ( !(mask & (BIT(XEN_PT_LEVEL_ORDER(i), UL) - 1)) )
>> +            {
>> +                    order = XEN_PT_LEVEL_ORDER(i);
>> +                    break;
> Nit: Style.
>
>> +            }
>> +        }
>> +
>> +        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
>> +        if ( rc )
>> +            break;
>> +
>> +        sgfn = gfn_add(sgfn, (1 << order));
>> +        if ( !mfn_eq(smfn, INVALID_MFN) )
>> +           smfn = mfn_add(smfn, (1 << order));
>> +
>> +        nr -= (1 << order);
> Throughout maybe better be safe right away and use 1UL?
>
>> +    }
>> +
>> +    return rc;
>>   }
> How's the caller going to know how much of the range was successfully
> mapped?

There is no such option. Do other arches do that? I mean, somehow returning
the number of successfully mapped (sgfn, smfn) pairs.

> That part may need undoing (if not here, then in the caller),
> or a caller may want to retry.

So the caller, in the case rc != 0, can just undo the full range
(by using the same sgfn, nr, smfn).
Or, as an option, just go for range (sgfn, nr), get each entry and if it
was mapped then just clear entry; otherwise just stop.

Yes, it isn't optimal, but should work.

Thanks.

~ Oleksii
Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Jan Beulich 3 months, 3 weeks ago
On 04.07.2025 17:01, Oleksii Kurochko wrote:
> On 7/1/25 3:49 PM, Jan Beulich wrote:
>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>> This patch introduces p2m_set_entry() and its core helper __p2m_set_entry() for
>>> RISC-V, based loosely on the Arm implementation, with several RISC-V-specific
>>> modifications.
>>>
>>> Key differences include:
>>> - TLB Flushing: RISC-V allows caching of invalid PTEs and does not require
>>>    break-before-make (BBM). As a result, the flushing logic is simplified.
>>>    TLB invalidation can be deferred until p2m_write_unlock() is called.
>>>    Consequently, the p2m->need_flush flag is always considered true and is
>>>    removed.
>>> - Page Table Traversal: The order of walking the page tables differs from Arm,
>>>    and this implementation reflects that reversed traversal.
>>> - Macro Adjustments: The macros P2M_ROOT_LEVEL, P2M_ROOT_ORDER, and
>>>    P2M_ROOT_PAGES are updated to align with the new RISC-V implementation.
>>>
>>> The main functionality is in __p2m_set_entry(), which handles mappings aligned
>>> to page table block entries (e.g., 1GB, 2MB, or 4KB with 4KB granularity).
>>>
>>> p2m_set_entry() breaks a region down into block-aligned mappings and calls
>>> __p2m_set_entry() accordingly.
>>>
>>> Stub implementations (to be completed later) include:
>>> - p2m_free_entry()
>> What would a function of this name do?
> 
> Recursively visiting all leaf PTE's for sub-tree behind an entry, then calls
> put_page() (which will free if there is no any reference to this page),
> freeing intermediate page table (after all entries were freed) by removing
> it from d->arch.paging.freelist, and removes correspondent page of intermediate page
> table from p2m->pages list.
> 
>> You can clear entries, but you can't
>> free them, can you?
> 
> Is is a question regarding terminology?

Yes. If one sees a call to a function, it should be possible to at least
roughly know what it does without needing to go look at the implementation.

> I can't free entry itself, but a page table or
> a page (if it is a leaf entry) on which it points could free.

Then e.g. pte_free_subtree() or some such?

>>> +static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>> The rule of thumb is to have inline functions only in header files, leaving
>> decisions to the compiler elsewhere.
> 
> I am not sure what you mean in the second part (after coma) of your sentence.

The compiler does its own inlining decisions quite fine when it can see all
call sites (as is the case for static functions). Hence in general you want
to omit "inline" there. Except of course in header files, where non-inline
static-s are a problem.

>>> +{
>>> +    panic("%s: isn't implemented for now\n", __func__);
>>> +
>>> +    return false;
>>> +}
>> For this function in particular, though: Besides the "p2me" in the name
>> being somewhat odd (supposedly page table entries here are simply pte_t),
>> how is this going to be different from pte_is_valid()?
> 
> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
> what is a type stored in the radix tree (p2m->p2m_types):
>    /*
>     * In the case of the P2M, the valid bit is used for other purpose. Use
>     * the type to check whether an entry is valid.
>     */
>    static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>    {
>        return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>    }
> 
> It is done to track which page was modified by a guest.

But then (again) the name doesn't convey what the function does. Plus
can't a guest also arrange for an entry's type to move to p2m_invalid?
That's then still an entry that was modified by the guest.

Overall I think I'm lacking clarity what you mean to use this predicate
for.

>>> +static inline void p2m_write_pte(pte_t *p, pte_t pte, bool clean_pte)
>>> +{
>>> +    write_pte(p, pte);
>>> +    if ( clean_pte )
>>> +        clean_dcache_va_range(p, sizeof(*p));
>>> +}
>>> +
>>> +static inline void p2m_remove_pte(pte_t *p, bool clean_pte)
>>> +{
>>> +    pte_t pte;
>>> +
>>> +    memset(&pte, 0x00, sizeof(pte));
>>> +    p2m_write_pte(p, pte, clean_pte);
>>> +}
>> May I suggest "clear" instead of "remove" and plain 0 instead of 0x00
>> (or simply give the variable a trivial initializer)?
> 
> Sure, I will rename and use plain 0.
> 
>>
>> As to the earlier function that I commented on: Seeing the names here,
>> wouldn't p2m_pte_is_valid() be a more consistent name there?
> 
> Then all p2me_*() should be updated to p2m_pte_*().
> 
> But initial logic was that p2me = p2m entry = p2m page table entry.
> 
> Probably we can just return back to the prefix p2m_ as based on arguments
> it is clear that it is a function for working with P2M's PTE.

In the end it's up to you. Having thought about it some more, perhaps
p2me_*() is still quite helpful to separate from functions dealing with
P2Ms as a whole, and to also avoid the verbosity of p2m_pte_*().

>>> +{
>>> +    unsigned int level;
>>> +    unsigned int target = page_order / PAGETABLE_ORDER;
>>> +    pte_t *entry, *table, orig_pte;
>>> +    int rc;
>>> +    /* A mapping is removed if the MFN is invalid. */
>>> +    bool removing_mapping = mfn_eq(smfn, INVALID_MFN);
>>> +    DECLARE_OFFSETS(offsets, gfn_to_gaddr(sgfn));
>>> +
>>> +    ASSERT(p2m_is_write_locked(p2m));
>>> +
>>> +    /*
>>> +     * Check if the level target is valid: we only support
>>> +     * 4K - 2M - 1G mapping.
>>> +     */
>>> +    ASSERT(target <= 2);
>> No provisions towards the division that produced the value having left
>> a remainder?
> 
> The way the order is initialized will always result in division without
> a remainder.
> 
>> If it makes sense, the ASSERT() could be updated to ensure that the order
>> is always a multiple of PAGETABLE_ORDER:
>    ASSERT((target <= 2) && !IS_ALIGNED(page_order, PAGETABLE_ORDER));

Except that the ! looks wrong here.

>>> @@ -332,7 +542,55 @@ static int p2m_set_entry(struct p2m_domain *p2m,
>>>                            p2m_type_t t,
>>>                            p2m_access_t a)
>>>   {
>>> -    return -EOPNOTSUPP;
>>> +    int rc = 0;
>>> +
>>> +    /*
>>> +     * Any reference taken by the P2M mappings (e.g. foreign mapping) will
>>> +     * be dropped in relinquish_p2m_mapping(). As the P2M will still
>>> +     * be accessible after, we need to prevent mapping to be added when the
>>> +     * domain is dying.
>>> +     */
>>> +    if ( unlikely(p2m->domain->is_dying) )
>>> +        return -ENOMEM;
>> Why ENOMEM?
> 
> I expect that when a domain is dying, it means there’s no point in using its
> memory—either because it's no longer available or it has already been freed.
> Basically, no memory.

That can end up odd for call sites. Please consider using e.g. EACCES.

>>> +    while ( nr )
>> Why's there a loop here? The function name uses singular, i.e. means to
>> create exactly one entry.
> 
> I will rename the function to  p2m_set_entries().

Or maybe p2m_set_range()?

>>> +        /*
>>> +         * Don't take into account the MFN when removing mapping (i.e
>>> +         * MFN_INVALID) to calculate the correct target order.
>>> +         *
>>> +         * XXX: Support superpage mappings if nr is not aligned to a
>>> +         * superpage size.
>>> +         */
>> Does this really need leaving as a to-do?
> 
> I think so, yes. It won’t break the current workflow if `nr` isn’t aligned,
> a smaller order will simply be chosen.

Well, my question was more like "Isn't it simple enough to cover the case
right away?"

>>> +        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
>>> +        mask |= gfn_x(sgfn) | nr;
>>> +
>>> +        for ( ; i != 0; i-- )
>>> +        {
>>> +            if ( !(mask & (BIT(XEN_PT_LEVEL_ORDER(i), UL) - 1)) )
>>> +            {
>>> +                    order = XEN_PT_LEVEL_ORDER(i);
>>> +                    break;
>> Nit: Style.
>>
>>> +            }
>>> +        }
>>> +
>>> +        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
>>> +        if ( rc )
>>> +            break;
>>> +
>>> +        sgfn = gfn_add(sgfn, (1 << order));
>>> +        if ( !mfn_eq(smfn, INVALID_MFN) )
>>> +           smfn = mfn_add(smfn, (1 << order));
>>> +
>>> +        nr -= (1 << order);
>> Throughout maybe better be safe right away and use 1UL?
>>
>>> +    }
>>> +
>>> +    return rc;
>>>   }
>> How's the caller going to know how much of the range was successfully
>> mapped?
> 
> There is no such option. Do other arches do that? I mean returns somehow
> the number of successfully mapped (sgfn,smfn).

On x86 we had to introduce some not very nice code to cover for the absence
of proper handling there. For a new port I think it wants at least seriously
considering not to repeat such a potentially unhelpful pattern.

>> That part may need undoing (if not here, then in the caller),
>> or a caller may want to retry.
> 
> So the caller in the case if rc != 0, can just undoing the full range
> (by using the same sgfn, nr, smfn).

Can it? How would it know what the original state was?

Jan

Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Oleksii Kurochko 3 months, 3 weeks ago
On 7/7/25 9:20 AM, Jan Beulich wrote:
> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>> This patch introduces p2m_set_entry() and its core helper __p2m_set_entry() for
>>>> RISC-V, based loosely on the Arm implementation, with several RISC-V-specific
>>>> modifications.
>>>>
>>>> Key differences include:
>>>> - TLB Flushing: RISC-V allows caching of invalid PTEs and does not require
>>>>     break-before-make (BBM). As a result, the flushing logic is simplified.
>>>>     TLB invalidation can be deferred until p2m_write_unlock() is called.
>>>>     Consequently, the p2m->need_flush flag is always considered true and is
>>>>     removed.
>>>> - Page Table Traversal: The order of walking the page tables differs from Arm,
>>>>     and this implementation reflects that reversed traversal.
>>>> - Macro Adjustments: The macros P2M_ROOT_LEVEL, P2M_ROOT_ORDER, and
>>>>     P2M_ROOT_PAGES are updated to align with the new RISC-V implementation.
>>>>
>>>> The main functionality is in __p2m_set_entry(), which handles mappings aligned
>>>> to page table block entries (e.g., 1GB, 2MB, or 4KB with 4KB granularity).
>>>>
>>>> p2m_set_entry() breaks a region down into block-aligned mappings and calls
>>>> __p2m_set_entry() accordingly.
>>>>
>>>> Stub implementations (to be completed later) include:
>>>> - p2m_free_entry()
>>> What would a function of this name do?
>> Recursively visiting all leaf PTE's for sub-tree behind an entry, then calls
>> put_page() (which will free if there is no any reference to this page),
>> freeing intermediate page table (after all entries were freed) by removing
>> it from d->arch.paging.freelist, and removes correspondent page of intermediate page
>> table from p2m->pages list.
>>
>>> You can clear entries, but you can't
>>> free them, can you?
>> Is is a question regarding terminology?
> Yes. If one sees a call to a function, it should be possible to at least
> roughly know what it does without needing to go look at the implementation.
>
>> I can't free entry itself, but a page table or
>> a page (if it is a leaf entry) on which it points could free.
> Then e.g. pte_free_subtree() or some such?

It sounds fine to me. I'll use suggested name.

I just want to note that other arches also have the same function
for the same purpose with the same name.
Does it make sense then to change a name for all arches?

>
>>>> +static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>> The rule of thumb is to have inline functions only in header files, leaving
>>> decisions to the compiler elsewhere.
>> I am not sure what you mean in the second part (after coma) of your sentence.
> The compiler does its own inlining decisions quite fine when it can see all
> call sites (as is the case for static functions). Hence in general you want
> to omit "inline" there. Except of course in header files, where non-inline
> static-s are a problem.

Thanks, now it is clear what you meant.

>
>>>> +{
>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>> +
>>>> +    return false;
>>>> +}
>>> For this function in particular, though: Besides the "p2me" in the name
>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>> how is this going to be different from pte_is_valid()?
>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>> what is a type stored in the radix tree (p2m->p2m_types):
>>     /*
>>      * In the case of the P2M, the valid bit is used for other purpose. Use
>>      * the type to check whether an entry is valid.
>>      */
>>     static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>     {
>>         return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>     }
>>
>> It is done to track which page was modified by a guest.
> But then (again) the name doesn't convey what the function does.

Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.

>   Plus
> can't a guest also arrange for an entry's type to move to p2m_invalid?
> That's then still an entry that was modified by the guest.

I am not really sure that I fully understand the question.
Do you ask if a guest can do something which will lead to a call of p2m_set_entry()
with p2m_invalid argument?
If yes, then it seems like it will be done only in the case of p2m_remove_mapping(), which
will mean that alongside p2m_invalid, INVALID_MFN will also be passed, which means
this entry isn't expected to be used anymore.

> Overall I think I'm lacking clarity what you mean to use this predicate
> for.

By using the "p2me_" prefix I wanted to express that not the PTE's valid bit will be
checked, but the type saved in the radix tree will be used.
As suggested above, it will probably be better to drop the "e" too and just use p2m_type_is_valid().

>
>>>> +static inline void p2m_write_pte(pte_t *p, pte_t pte, bool clean_pte)
>>>> +{
>>>> +    write_pte(p, pte);
>>>> +    if ( clean_pte )
>>>> +        clean_dcache_va_range(p, sizeof(*p));
>>>> +}
>>>> +
>>>> +static inline void p2m_remove_pte(pte_t *p, bool clean_pte)
>>>> +{
>>>> +    pte_t pte;
>>>> +
>>>> +    memset(&pte, 0x00, sizeof(pte));
>>>> +    p2m_write_pte(p, pte, clean_pte);
>>>> +}
>>> May I suggest "clear" instead of "remove" and plain 0 instead of 0x00
>>> (or simply give the variable a trivial initializer)?
>> Sure, I will rename and use plain 0.
>>
>>> As to the earlier function that I commented on: Seeing the names here,
>>> wouldn't p2m_pte_is_valid() be a more consistent name there?
>> Then all p2me_*() should be updated to p2m_pte_*().
>>
>> But initial logic was that p2me = p2m entry = p2m page table entry.
>>
>> Probably we can just return back to the prefix p2m_ as based on arguments
>> it is clear that it is a function for working with P2M's PTE.
> In the end it's up to you. Having thought about it some more, perhaps
> p2me_*() is still quite helpful to separate from functions dealing with
> P2Ms as a while, and to also avoid the verbosity of p2m_pte_*().
>
>>>> +{
>>>> +    unsigned int level;
>>>> +    unsigned int target = page_order / PAGETABLE_ORDER;
>>>> +    pte_t *entry, *table, orig_pte;
>>>> +    int rc;
>>>> +    /* A mapping is removed if the MFN is invalid. */
>>>> +    bool removing_mapping = mfn_eq(smfn, INVALID_MFN);
>>>> +    DECLARE_OFFSETS(offsets, gfn_to_gaddr(sgfn));
>>>> +
>>>> +    ASSERT(p2m_is_write_locked(p2m));
>>>> +
>>>> +    /*
>>>> +     * Check if the level target is valid: we only support
>>>> +     * 4K - 2M - 1G mapping.
>>>> +     */
>>>> +    ASSERT(target <= 2);
>>> No provisions towards the division that produced the value having left
>>> a remainder?
>> The way the order is initialized will always result in division without
>> a remainder.
>>
>> If it makes sense, the ASSERT() could be updated to ensure that the order
>> is always a multiple of PAGETABLE_ORDER:
>>     ASSERT((target <= 2) && !IS_ALIGNED(page_order, PAGETABLE_ORDER));
> Except that the ! looks wrong here.

Agree, it shouldn't be here. Thanks.

>
>>>> @@ -332,7 +542,55 @@ static int p2m_set_entry(struct p2m_domain *p2m,
>>>>                             p2m_type_t t,
>>>>                             p2m_access_t a)
>>>>    {
>>>> -    return -EOPNOTSUPP;
>>>> +    int rc = 0;
>>>> +
>>>> +    /*
>>>> +     * Any reference taken by the P2M mappings (e.g. foreign mapping) will
>>>> +     * be dropped in relinquish_p2m_mapping(). As the P2M will still
>>>> +     * be accessible after, we need to prevent mapping to be added when the
>>>> +     * domain is dying.
>>>> +     */
>>>> +    if ( unlikely(p2m->domain->is_dying) )
>>>> +        return -ENOMEM;
>>> Why ENOMEM?
>> I expect that when a domain is dying, it means there’s no point in using its
>> memory—either because it's no longer available or it has already been freed.
>> Basically, no memory.
> That can end up odd for call sites. Please consider using e.g. EACCES.
>
>>>> +    while ( nr )
>>> Why's there a loop here? The function name uses singular, i.e. means to
>>> create exactly one entry.
>> I will rename the function to  p2m_set_entries().
> Or maybe p2m_set_range()?

It is much better.

>
>>>> +        /*
>>>> +         * Don't take into account the MFN when removing mapping (i.e
>>>> +         * MFN_INVALID) to calculate the correct target order.
>>>> +         *
>>>> +         * XXX: Support superpage mappings if nr is not aligned to a
>>>> +         * superpage size.
>>>> +         */
>>> Does this really need leaving as a to-do?
>> I think so, yes. It won’t break the current workflow if `nr` isn’t aligned,
>> a smaller order will simply be chosen.
> Well, my question was more like "Isn't it simple enough to cover the case
> right away?"
>
>>>> +        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
>>>> +        mask |= gfn_x(sgfn) | nr;
>>>> +
>>>> +        for ( ; i != 0; i-- )
>>>> +        {
>>>> +            if ( !(mask & (BIT(XEN_PT_LEVEL_ORDER(i), UL) - 1)) )
>>>> +            {
>>>> +                    order = XEN_PT_LEVEL_ORDER(i);
>>>> +                    break;
>>> Nit: Style.
>>>
>>>> +            }
>>>> +        }
>>>> +
>>>> +        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
>>>> +        if ( rc )
>>>> +            break;
>>>> +
>>>> +        sgfn = gfn_add(sgfn, (1 << order));
>>>> +        if ( !mfn_eq(smfn, INVALID_MFN) )
>>>> +           smfn = mfn_add(smfn, (1 << order));
>>>> +
>>>> +        nr -= (1 << order);
>>> Throughout maybe better be safe right away and use 1UL?
>>>
>>>> +    }
>>>> +
>>>> +    return rc;
>>>>    }
>>> How's the caller going to know how much of the range was successfully
>>> mapped?
>> There is no such option. Do other arches do that? I mean returns somehow
>> the number of successfully mapped (sgfn,smfn).
> On x86 we had to introduce some not very nice code to cover for the absence
> of proper handling there. For a new port I think it wants at least seriously
> considering not to repeat such a potentially unhelpful pattern.
>
>>> That part may need undoing (if not here, then in the caller),
>>> or a caller may want to retry.
>> So the caller in the case if rc != 0, can just undoing the full range
>> (by using the same sgfn, nr, smfn).
> Can it? How would it know what the original state was?

You're right — blindly unmapping the range assumes that no entries were valid
beforehand and I missed that it could be that something valid was mapped before
p2m_set_entry(sgfn,...,smfn) was called.
But then I don't really understand why it won't be an issue if we know
how many GFNs were successfully mapped.

~ Oleksii
Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Jan Beulich 3 months, 3 weeks ago
On 07.07.2025 13:46, Oleksii Kurochko wrote:
> On 7/7/25 9:20 AM, Jan Beulich wrote:
>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>> This patch introduces p2m_set_entry() and its core helper __p2m_set_entry() for
>>>>> RISC-V, based loosely on the Arm implementation, with several RISC-V-specific
>>>>> modifications.
>>>>>
>>>>> Key differences include:
>>>>> - TLB Flushing: RISC-V allows caching of invalid PTEs and does not require
>>>>>     break-before-make (BBM). As a result, the flushing logic is simplified.
>>>>>     TLB invalidation can be deferred until p2m_write_unlock() is called.
>>>>>     Consequently, the p2m->need_flush flag is always considered true and is
>>>>>     removed.
>>>>> - Page Table Traversal: The order of walking the page tables differs from Arm,
>>>>>     and this implementation reflects that reversed traversal.
>>>>> - Macro Adjustments: The macros P2M_ROOT_LEVEL, P2M_ROOT_ORDER, and
>>>>>     P2M_ROOT_PAGES are updated to align with the new RISC-V implementation.
>>>>>
>>>>> The main functionality is in __p2m_set_entry(), which handles mappings aligned
>>>>> to page table block entries (e.g., 1GB, 2MB, or 4KB with 4KB granularity).
>>>>>
>>>>> p2m_set_entry() breaks a region down into block-aligned mappings and calls
>>>>> __p2m_set_entry() accordingly.
>>>>>
>>>>> Stub implementations (to be completed later) include:
>>>>> - p2m_free_entry()
>>>> What would a function of this name do?
>>> Recursively visiting all leaf PTE's for sub-tree behind an entry, then calls
>>> put_page() (which will free if there is no any reference to this page),
>>> freeing intermediate page table (after all entries were freed) by removing
>>> it from d->arch.paging.freelist, and removes correspondent page of intermediate page
>>> table from p2m->pages list.
>>>
>>>> You can clear entries, but you can't
>>>> free them, can you?
>>> Is is a question regarding terminology?
>> Yes. If one sees a call to a function, it should be possible to at least
>> roughly know what it does without needing to go look at the implementation.
>>
>>> I can't free entry itself, but a page table or
>>> a page (if it is a leaf entry) on which it points could free.
>> Then e.g. pte_free_subtree() or some such?
> 
> It sounds fine to me. I'll use suggested name.
> 
> Just want to notice that other arches also have the same function
> for the same purpose with the same name.

As to x86, it's not general P2M code which uses this odd (for the purpose)
name, but only p2m-pt.c.

> Does it make sense then to change a name for all arches?

I think so.

>>>>> +{
>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>> +
>>>>> +    return false;
>>>>> +}
>>>> For this function in particular, though: Besides the "p2me" in the name
>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>> how is this going to be different from pte_is_valid()?
>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>     /*
>>>      * In the case of the P2M, the valid bit is used for other purpose. Use
>>>      * the type to check whether an entry is valid.
>>>      */
>>>     static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>     {
>>>         return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>     }
>>>
>>> It is done to track which page was modified by a guest.
>> But then (again) the name doesn't convey what the function does.
> 
> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.

For P2M type checks please don't invent new naming, but use what both x86
and Arm are already using. Note how we already have p2m_is_valid() in that
set. Just that it's not doing what you want here.

>>   Plus
>> can't a guest also arrange for an entry's type to move to p2m_invalid?
>> That's then still an entry that was modified by the guest.
> 
> I am not really sure that I fully understand the question.
> Do you ask if a guest can do something which will lead to a call of p2m_set_entry()
> with p2m_invalid argument?

That I'm not asking, but rather stating. I.e. I expect such is possible.

> If yes, then it seems like it will be done only in case of p2m_remove_mapping() what
> will mean that alongside with p2m_invalid INVALID_MFN will be also passed, what means
> this entry isn't expected to be used anymore.

Right. But such an entry would still have been "modified" by the guest.

>> Overall I think I'm lacking clarity what you mean to use this predicate
>> for.
> 
> By using of "p2me_" predicate I wanted to express that not PTE's valid bit will be
> checked, but the type saved in radix tree will be used.
> As suggested above probably it will be better drop "e" too and just use p2m_type_is_valid().

See above regarding that name.

>>>>> +        /*
>>>>> +         * Don't take into account the MFN when removing mapping (i.e
>>>>> +         * MFN_INVALID) to calculate the correct target order.
>>>>> +         *
>>>>> +         * XXX: Support superpage mappings if nr is not aligned to a
>>>>> +         * superpage size.
>>>>> +         */
>>>> Does this really need leaving as a to-do?
>>> I think so, yes. It won’t break the current workflow if|nr| isn’t aligned,
>>> a smaller order will simply be chosen.
>> Well, my question was more like "Isn't it simple enough to cover the case
>> right away?"
>>
>>>>> +        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
>>>>> +        mask |= gfn_x(sgfn) | nr;
>>>>> +
>>>>> +        for ( ; i != 0; i-- )
>>>>> +        {
>>>>> +            if ( !(mask & (BIT(XEN_PT_LEVEL_ORDER(i), UL) - 1)) )
>>>>> +            {
>>>>> +                    order = XEN_PT_LEVEL_ORDER(i);
>>>>> +                    break;
>>>> Nit: Style.
>>>>
>>>>> +            }
>>>>> +        }
>>>>> +
>>>>> +        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
>>>>> +        if ( rc )
>>>>> +            break;
>>>>> +
>>>>> +        sgfn = gfn_add(sgfn, (1 << order));
>>>>> +        if ( !mfn_eq(smfn, INVALID_MFN) )
>>>>> +           smfn = mfn_add(smfn, (1 << order));
>>>>> +
>>>>> +        nr -= (1 << order);
>>>> Throughout maybe better be safe right away and use 1UL?
>>>>
>>>>> +    }
>>>>> +
>>>>> +    return rc;
>>>>>    }
>>>> How's the caller going to know how much of the range was successfully
>>>> mapped?
>>> There is no such option. Do other arches do that? I mean returns somehow
>>> the number of successfully mapped (sgfn,smfn).
>> On x86 we had to introduce some not very nice code to cover for the absence
>> of proper handling there. For a new port I think it wants at least seriously
>> considering not to repeat such a potentially unhelpful pattern.
>>
>>>> That part may need undoing (if not here, then in the caller),
>>>> or a caller may want to retry.
>>> So the caller in the case if rc != 0, can just undoing the full range
>>> (by using the same sgfn, nr, smfn).
>> Can it? How would it know what the original state was?
> 
> You're right — blindly unmapping the range assumes that no entries were valid
> beforehand and I missed that it could be that something valid was mapped before
> p2m_set_entry(sgfn,...,smfn) was called.
> But then I am not really understand why it won't be an issue if will know
> how many GFNs were successfully mapped.

The caller may know what that range's state was. But what I really wanted to
convey is: Updating multiple entries in one go is complicated in some of the
corner cases. You will want to think this through now, in order to avoid the
need to re-write everything later again.

Jan

Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Oleksii Kurochko 3 months, 3 weeks ago
On 7/7/25 2:53 PM, Jan Beulich wrote:
> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>> This patch introduces p2m_set_entry() and its core helper __p2m_set_entry() for
>>>>>> RISC-V, based loosely on the Arm implementation, with several RISC-V-specific
>>>>>> modifications.
>>>>>>
>>>>>> Key differences include:
>>>>>> - TLB Flushing: RISC-V allows caching of invalid PTEs and does not require
>>>>>>      break-before-make (BBM). As a result, the flushing logic is simplified.
>>>>>>      TLB invalidation can be deferred until p2m_write_unlock() is called.
>>>>>>      Consequently, the p2m->need_flush flag is always considered true and is
>>>>>>      removed.
>>>>>> - Page Table Traversal: The order of walking the page tables differs from Arm,
>>>>>>      and this implementation reflects that reversed traversal.
>>>>>> - Macro Adjustments: The macros P2M_ROOT_LEVEL, P2M_ROOT_ORDER, and
>>>>>>      P2M_ROOT_PAGES are updated to align with the new RISC-V implementation.
>>>>>>
>>>>>> The main functionality is in __p2m_set_entry(), which handles mappings aligned
>>>>>> to page table block entries (e.g., 1GB, 2MB, or 4KB with 4KB granularity).
>>>>>>
>>>>>> p2m_set_entry() breaks a region down into block-aligned mappings and calls
>>>>>> __p2m_set_entry() accordingly.
>>>>>>
>>>>>> Stub implementations (to be completed later) include:
>>>>>> - p2m_free_entry()
>>>>> What would a function of this name do?
>>>> Recursively visiting all leaf PTE's for sub-tree behind an entry, then calls
>>>> put_page() (which will free if there is no any reference to this page),
>>>> freeing intermediate page table (after all entries were freed) by removing
>>>> it from d->arch.paging.freelist, and removes correspondent page of intermediate page
>>>> table from p2m->pages list.
>>>>
>>>>> You can clear entries, but you can't
>>>>> free them, can you?
>>>> Is is a question regarding terminology?
>>> Yes. If one sees a call to a function, it should be possible to at least
>>> roughly know what it does without needing to go look at the implementation.
>>>
>>>> I can't free entry itself, but a page table or
>>>> a page (if it is a leaf entry) on which it points could free.
>>> Then e.g. pte_free_subtree() or some such?
>> It sounds fine to me. I'll use suggested name.
>>
>> Just want to notice that other arches also have the same function
>> for the same purpose with the same name.
> As to x86, it's not general P2M code which uses this odd (for the purpose)
> name, but only p2m-pt.c.
>
>> Does it make sense then to change a name for all arches?
> I think so.
>
>>>>>> +{
>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>> +
>>>>>> +    return false;
>>>>>> +}
>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>> how is this going to be different from pte_is_valid()?
>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>      /*
>>>>       * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>       * the type to check whether an entry is valid.
>>>>       */
>>>>      static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>      {
>>>>          return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>      }
>>>>
>>>> It is done to track which page was modified by a guest.
>>> But then (again) the name doesn't convey what the function does.
>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
> For P2M type checks please don't invent new naming, but use what both x86
> and Arm are already using. Note how we already have p2m_is_valid() in that
> set. Just that it's not doing what you want here.

Hm, why isn't it doing what I want? p2m_is_valid() verifies whether a P2M entry is valid.
And here it is checked whether a P2M PTE is valid from the P2M point of view, by checking
the type in the radix tree and/or in the reserved PTE bits (just a reminder: we have only 2
free bits for the type).

>
>>>    Plus
>>> can't a guest also arrange for an entry's type to move to p2m_invalid?
>>> That's then still an entry that was modified by the guest.
>> I am not really sure that I fully understand the question.
>> Do you ask if a guest can do something which will lead to a call of p2m_set_entry()
>> with p2m_invalid argument?
> That I'm not asking, but rather stating. I.e. I expect such is possible.
>
>> If yes, then it seems like it will be done only in case of p2m_remove_mapping() what
>> will mean that alongside with p2m_invalid INVALID_MFN will be also passed, what means
>> this entry isn't expected to be used anymore.
> Right. But such an entry would still have been "modified" by the guest.

Yes, but then nothing needs to be done with it. For example, if it is already invalid there
is no sense in flushing the page to RAM (as in this case the PTE's bit will be checked),
similar to what Arm does:
   https://elixir.bootlin.com/xen/v4.20.0/source/xen/arch/arm/p2m.c#L375

>>>>>> +        /*
>>>>>> +         * Don't take into account the MFN when removing mapping (i.e
>>>>>> +         * MFN_INVALID) to calculate the correct target order.
>>>>>> +         *
>>>>>> +         * XXX: Support superpage mappings if nr is not aligned to a
>>>>>> +         * superpage size.
>>>>>> +         */
>>>>> Does this really need leaving as a to-do?
>>>> I think so, yes. It won’t break the current workflow if|nr| isn’t aligned,
>>>> a smaller order will simply be chosen.
>>> Well, my question was more like "Isn't it simple enough to cover the case
>>> right away?"
>>>
>>>>>> +        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
>>>>>> +        mask |= gfn_x(sgfn) | nr;
>>>>>> +
>>>>>> +        for ( ; i != 0; i-- )
>>>>>> +        {
>>>>>> +            if ( !(mask & (BIT(XEN_PT_LEVEL_ORDER(i), UL) - 1)) )
>>>>>> +            {
>>>>>> +                    order = XEN_PT_LEVEL_ORDER(i);
>>>>>> +                    break;
>>>>> Nit: Style.
>>>>>
>>>>>> +            }
>>>>>> +        }
>>>>>> +
>>>>>> +        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
>>>>>> +        if ( rc )
>>>>>> +            break;
>>>>>> +
>>>>>> +        sgfn = gfn_add(sgfn, (1 << order));
>>>>>> +        if ( !mfn_eq(smfn, INVALID_MFN) )
>>>>>> +           smfn = mfn_add(smfn, (1 << order));
>>>>>> +
>>>>>> +        nr -= (1 << order);
>>>>> Throughout maybe better be safe right away and use 1UL?
>>>>>
>>>>>> +    }
>>>>>> +
>>>>>> +    return rc;
>>>>>>     }
>>>>> How's the caller going to know how much of the range was successfully
>>>>> mapped?
>>>> There is no such option. Do other arches do that? I mean returns somehow
>>>> the number of successfully mapped (sgfn,smfn).
>>> On x86 we had to introduce some not very nice code to cover for the absence
>>> of proper handling there. For a new port I think it wants at least seriously
>>> considering not to repeat such a potentially unhelpful pattern.
>>>
>>>>> That part may need undoing (if not here, then in the caller),
>>>>> or a caller may want to retry.
>>>> So the caller in the case if rc != 0, can just undoing the full range
>>>> (by using the same sgfn, nr, smfn).
>>> Can it? How would it know what the original state was?
>> You're right — blindly unmapping the range assumes that no entries were valid
>> beforehand and I missed that it could be that something valid was mapped before
>> p2m_set_entry(sgfn,...,smfn) was called.
>> But then I am not really understand why it won't be an issue if will know
>> how many GFNs were successfully mapped.
> The caller may know what that range's state was. But what I really wanted to
> convey is: Updating multiple entries in one go is complicated in some of the
> corner cases. You will want to think this through now, in order to avoid the
> need to re-write everything later again.

I can add one more argument to return the number of successfully mapped GFNs.
Fortunately, that's very easy to do.

The problem for me is that I don’t really understand what the caller is supposed
to do with that information. The only use case I can think of is that the caller
might try to map the remaining GFNs again. But that doesn’t seem very useful:
if p2m_set_entry() wasn’t able to map the full range, it likely indicates a serious
issue, and retrying would probably result in the same error.

The same applies to rolling back the state. It wouldn’t be difficult to add a local
array to track all modified PTEs and then use it to revert the state if needed.
But again, what would the caller do after the rollback? At this point, it still seems
like the best option is simply to panic().

Basically, I don’t see or understand the cases where knowing how many GFNs were
successfully mapped, or whether a rollback was performed, would really help — because
in most cases, I don’t have a better option than just calling panic() at the end.

For example, if I call map_regions_p2mt() for an MMIO region described in a device
tree node, and the mapping fails partway through, I’m left with two options: either
ignore the device (if it's not essential for Xen or guest functionality) and continue
booting — in which case I’d need to perform a rollback, and simply knowing the number
of successfully mapped GFNs may not be enough — or, more likely, just panic.

Are there any realistic use cases where knowing the number of mapped GFNs or having
rollback support would actually allow us to avoid a panic?

Even more so, how would that information be used in the current call chain?
We have the following chain:
  map_regions_p2mt() → p2m_insert() → p2m_set_entry()

If p2m_set_entry() returns the number of successfully mapped GFNs, what should
p2m_insert() do with it — process it further, or just pass it along to
map_regions_p2mt()?

Thanks in advance for clarifications.
~ Oleksii

Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Jan Beulich 3 months, 3 weeks ago
On 07.07.2025 17:00, Oleksii Kurochko wrote:
> On 7/7/25 2:53 PM, Jan Beulich wrote:
>> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>> +{
>>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>>> +
>>>>>>> +    return false;
>>>>>>> +}
>>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>>> how is this going to be different from pte_is_valid()?
>>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>>      /*
>>>>>       * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>>       * the type to check whether an entry is valid.
>>>>>       */
>>>>>      static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>>      {
>>>>>          return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>>      }
>>>>>
>>>>> It is done to track which page was modified by a guest.
>>>> But then (again) the name doesn't convey what the function does.
>>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
>> For P2M type checks please don't invent new naming, but use what both x86
>> and Arm are already using. Note how we already have p2m_is_valid() in that
>> set. Just that it's not doing what you want here.
> 
> Hm, why not doing what I want? p2m_is_valid() verifies if P2M entry is valid.
> And in here it is checked if P2M pte is valid from P2M point of view by checking
> the type in radix tree and/or in reserved PTEs bits (just to remind we have only 2
> free bits for type).

Because this is how it's defined on x86:

#define p2m_is_valid(_t)    (p2m_to_mask(_t) & \
                             (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))

I.e. more strict that simply "!= p2m_invalid". And I think such predicates
would better be uniform across architectures, such that in principle they
might also be usable in common code (as we already do with p2m_is_foreign()).

>>>>    Plus
>>>> can't a guest also arrange for an entry's type to move to p2m_invalid?
>>>> That's then still an entry that was modified by the guest.
>>> I am not really sure that I fully understand the question.
>>> Do you ask if a guest can do something which will lead to a call of p2m_set_entry()
>>> with p2m_invalid argument?
>> That I'm not asking, but rather stating. I.e. I expect such is possible.
>>
>>> If yes, then it seems like it will be done only in case of p2m_remove_mapping() what
>>> will mean that alongside with p2m_invalid INVALID_MFN will be also passed, what means
>>> this entry isn't expected to be used anymore.
>> Right. But such an entry would still have been "modified" by the guest.
> 
> Yes, but nothing then is needed to do with it.

I understand that. Maybe I'm overly picky, but all of the above was in response
to you saying "It is done to track which page was modified by a guest." And I'm
simply trying to get you to use precise wording, both in code comments and in
discussions. In a case like the one here I simply can't judge whether you simply
expressed yourself not clear enough, or whether you indeed meant what you said.

>>>>>>> +        /*
>>>>>>> +         * Don't take into account the MFN when removing mapping (i.e
>>>>>>> +         * MFN_INVALID) to calculate the correct target order.
>>>>>>> +         *
>>>>>>> +         * XXX: Support superpage mappings if nr is not aligned to a
>>>>>>> +         * superpage size.
>>>>>>> +         */
>>>>>> Does this really need leaving as a to-do?
>>>>> I think so, yes. It won’t break the current workflow if|nr| isn’t aligned,
>>>>> a smaller order will simply be chosen.
>>>> Well, my question was more like "Isn't it simple enough to cover the case
>>>> right away?"
>>>>
>>>>>>> +        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
>>>>>>> +        mask |= gfn_x(sgfn) | nr;
>>>>>>> +
>>>>>>> +        for ( ; i != 0; i-- )
>>>>>>> +        {
>>>>>>> +            if ( !(mask & (BIT(XEN_PT_LEVEL_ORDER(i), UL) - 1)) )
>>>>>>> +            {
>>>>>>> +                    order = XEN_PT_LEVEL_ORDER(i);
>>>>>>> +                    break;
>>>>>> Nit: Style.
>>>>>>
>>>>>>> +            }
>>>>>>> +        }
>>>>>>> +
>>>>>>> +        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
>>>>>>> +        if ( rc )
>>>>>>> +            break;
>>>>>>> +
>>>>>>> +        sgfn = gfn_add(sgfn, (1 << order));
>>>>>>> +        if ( !mfn_eq(smfn, INVALID_MFN) )
>>>>>>> +           smfn = mfn_add(smfn, (1 << order));
>>>>>>> +
>>>>>>> +        nr -= (1 << order);
>>>>>> Throughout maybe better be safe right away and use 1UL?
>>>>>>
>>>>>>> +    }
>>>>>>> +
>>>>>>> +    return rc;
>>>>>>>     }
>>>>>> How's the caller going to know how much of the range was successfully
>>>>>> mapped?
>>>>> There is no such option. Do other arches do that? I mean returns somehow
>>>>> the number of successfully mapped (sgfn,smfn).
>>>> On x86 we had to introduce some not very nice code to cover for the absence
>>>> of proper handling there. For a new port I think it wants at least seriously
>>>> considering not to repeat such a potentially unhelpful pattern.
>>>>
>>>>>> That part may need undoing (if not here, then in the caller),
>>>>>> or a caller may want to retry.
>>>>> So the caller in the case if rc != 0, can just undoing the full range
>>>>> (by using the same sgfn, nr, smfn).
>>>> Can it? How would it know what the original state was?
>>> You're right — blindly unmapping the range assumes that no entries were valid
>>> beforehand and I missed that it could be that something valid was mapped before
>>> p2m_set_entry(sgfn,...,smfn) was called.
>>> But then I am not really understand why it won't be an issue if will know
>>> how many GFNs were successfully mapped.
>> The caller may know what that range's state was. But what I really wanted to
>> convey is: Updating multiple entries in one go is complicated in some of the
>> corner cases. You will want to think this through now, in order to avoid the
>> need to re-write everything later again.
> 
> I can add one more argument to return the number of successfully mapped GFNs.
> Fortunately, that's very easy to do.
> 
> The problem for me is that I don’t really understand what the caller is supposed
> to do with that information.

That's only the 2nd step to take. The first is: What behavior do you want, overall?

> The only use case I can think of is that the caller
> might try to map the remaining GFNs again. But that doesn’t seem very useful,
> if|p2m_set_entry()| wasn’t able to map the full range, it likely indicates a serious
> issue, and retrying would probably result in the same error.
> 
> The same applies to rolling back the state. It wouldn’t be difficult to add a local
> array to track all modified PTEs and then use it to revert the state if needed.
> But again, what would the caller do after the rollback? At this point, it still seems
> like the best option is simply to|panic(). |
> 
> Basically, I don’t see or understand the cases where knowing how many GFNs were
> successfully mapped, or whether a rollback was performed, would really help — because
> in most cases, I don’t have a better option than just calling|panic()| at the end.

panic()-ing is of course only a last resort. Anything related to domain handling
would better crash only the domain in question. And even that only if suitable
error handling isn't possible.

> For example, if I call|map_regions_p2mt()| for an MMIO region described in a device
> tree node, and the mapping fails partway through, I’m left with two options: either
> ignore the device (if it's not essential for Xen or guest functionality) and continue
>   booting; in which case I’d need to perform a rollback, and simply knowing the number
> of successfully mapped GFNs may not be enough or, more likely, just panic.

Well, no. For example, before even trying to map you could check that the range
of P2M entries covered is all empty. _Then_ you know how to correctly roll back.
And yes, doing so may not even require passing back information on how much of
a region was successfully mapped.

Jan

Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Oleksii Kurochko 3 months, 3 weeks ago
On 7/7/25 5:15 PM, Jan Beulich wrote:
> On 07.07.2025 17:00, Oleksii Kurochko wrote:
>> On 7/7/25 2:53 PM, Jan Beulich wrote:
>>> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>>>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>> +{
>>>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>>>> +
>>>>>>>> +    return false;
>>>>>>>> +}
>>>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>>>> how is this going to be different from pte_is_valid()?
>>>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>>>       /*
>>>>>>        * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>>>        * the type to check whether an entry is valid.
>>>>>>        */
>>>>>>       static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>>>       {
>>>>>>           return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>>>       }
>>>>>>
>>>>>> It is done to track which page was modified by a guest.
>>>>> But then (again) the name doesn't convey what the function does.
>>>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
>>> For P2M type checks please don't invent new naming, but use what both x86
>>> and Arm are already using. Note how we already have p2m_is_valid() in that
>>> set. Just that it's not doing what you want here.
>> Hm, why not doing what I want? p2m_is_valid() verifies if P2M entry is valid.
>> And in here it is checked if P2M pte is valid from P2M point of view by checking
>> the type in radix tree and/or in reserved PTEs bits (just to remind we have only 2
>> free bits for type).
> Because this is how it's defined on x86:
>
> #define p2m_is_valid(_t)    (p2m_to_mask(_t) & \
>                               (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
>
> I.e. more strict that simply "!= p2m_invalid". And I think such predicates
> would better be uniform across architectures, such that in principle they
> might also be usable in common code (as we already do with p2m_is_foreign()).

Yeah, Arm isn't so strict in its definition of p2m_is_valid(), and it seems like
x86 and Arm have different understandings of what is valid.

Except for what is mentioned in the comment — that grant types aren't considered valid
for x86 (and shouldn't the same then hold for Arm?) — it isn't clear why x86's
p2m_is_valid() is stricter than Arm's, and whether other arches should also be
so strict.
It seems like, from the point of view of mapping/unmapping, it is enough just
to verify a "copy" of the PTE's valid bit (in P2M terms, the p2m_invalid type).

>
>>>>>     Plus
>>>>> can't a guest also arrange for an entry's type to move to p2m_invalid?
>>>>> That's then still an entry that was modified by the guest.
>>>> I am not really sure that I fully understand the question.
>>>> Do you ask if a guest can do something which will lead to a call of p2m_set_entry()
>>>> with p2m_invalid argument?
>>> That I'm not asking, but rather stating. I.e. I expect such is possible.
>>>
>>>> If yes, then it seems like it will be done only in case of p2m_remove_mapping() what
>>>> will mean that alongside with p2m_invalid INVALID_MFN will be also passed, what means
>>>> this entry isn't expected to be used anymore.
>>> Right. But such an entry would still have been "modified" by the guest.
>> Yes, but nothing then is needed to do with it.
> I understand that. Maybe I'm overly picky, but all of the above was in response
> to you saying "It is done to track which page was modified by a guest." And I'm
> simply trying to get you to use precise wording, both in code comments and in
> discussions. In a case like the one here I simply can't judge whether you simply
> expressed yourself not clear enough, or whether you indeed meant what you said.
>
>>>>>>>> +        /*
>>>>>>>> +         * Don't take into account the MFN when removing mapping (i.e
>>>>>>>> +         * MFN_INVALID) to calculate the correct target order.
>>>>>>>> +         *
>>>>>>>> +         * XXX: Support superpage mappings if nr is not aligned to a
>>>>>>>> +         * superpage size.
>>>>>>>> +         */
>>>>>>> Does this really need leaving as a to-do?
>>>>>> I think so, yes. It won’t break the current workflow if|nr| isn’t aligned,
>>>>>> a smaller order will simply be chosen.
>>>>> Well, my question was more like "Isn't it simple enough to cover the case
>>>>> right away?"
>>>>>
>>>>>>>> +        mask = !mfn_eq(smfn, INVALID_MFN) ? mfn_x(smfn) : 0;
>>>>>>>> +        mask |= gfn_x(sgfn) | nr;
>>>>>>>> +
>>>>>>>> +        for ( ; i != 0; i-- )
>>>>>>>> +        {
>>>>>>>> +            if ( !(mask & (BIT(XEN_PT_LEVEL_ORDER(i), UL) - 1)) )
>>>>>>>> +            {
>>>>>>>> +                    order = XEN_PT_LEVEL_ORDER(i);
>>>>>>>> +                    break;
>>>>>>> Nit: Style.
>>>>>>>
>>>>>>>> +            }
>>>>>>>> +        }
>>>>>>>> +
>>>>>>>> +        rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
>>>>>>>> +        if ( rc )
>>>>>>>> +            break;
>>>>>>>> +
>>>>>>>> +        sgfn = gfn_add(sgfn, (1 << order));
>>>>>>>> +        if ( !mfn_eq(smfn, INVALID_MFN) )
>>>>>>>> +           smfn = mfn_add(smfn, (1 << order));
>>>>>>>> +
>>>>>>>> +        nr -= (1 << order);
>>>>>>> Throughout maybe better be safe right away and use 1UL?
>>>>>>>
>>>>>>>> +    }
>>>>>>>> +
>>>>>>>> +    return rc;
>>>>>>>>      }
>>>>>>> How's the caller going to know how much of the range was successfully
>>>>>>> mapped?
>>>>>> There is no such option. Do other arches do that? I mean returns somehow
>>>>>> the number of successfully mapped (sgfn,smfn).
>>>>> On x86 we had to introduce some not very nice code to cover for the absence
>>>>> of proper handling there. For a new port I think it wants at least seriously
>>>>> considering not to repeat such a potentially unhelpful pattern.
>>>>>
>>>>>>> That part may need undoing (if not here, then in the caller),
>>>>>>> or a caller may want to retry.
>>>>>> So the caller in the case if rc != 0, can just undoing the full range
>>>>>> (by using the same sgfn, nr, smfn).
>>>>> Can it? How would it know what the original state was?
>>>> You're right — blindly unmapping the range assumes that no entries were valid
>>>> beforehand and I missed that it could be that something valid was mapped before
>>>> p2m_set_entry(sgfn,...,smfn) was called.
>>>> But then I am not really understand why it won't be an issue if will know
>>>> how many GFNs were successfully mapped.
>>> The caller may know what that range's state was. But what I really wanted to
>>> convey is: Updating multiple entries in one go is complicated in some of the
>>> corner cases. You will want to think this through now, in order to avoid the
>>> need to re-write everything later again.
>> I can add one more argument to return the number of successfully mapped GFNs.
>> Fortunately, that's very easy to do.
>>
>> The problem for me is that I don’t really understand what the caller is supposed
>> to do with that information.
> That's only the 2nd step to take. The first is: What behavior do you want, overall?

My initial idea was that if something went wrong (rc != 0) we would just panic(). But
based on your questions it seems like that isn't the best idea.

>
>> The only use case I can think of is that the caller
>> might try to map the remaining GFNs again. But that doesn’t seem very useful,
>> if|p2m_set_entry()| wasn’t able to map the full range, it likely indicates a serious
>> issue, and retrying would probably result in the same error.
>>
>> The same applies to rolling back the state. It wouldn’t be difficult to add a local
>> array to track all modified PTEs and then use it to revert the state if needed.
>> But again, what would the caller do after the rollback? At this point, it still seems
>> like the best option is simply to|panic(). |
>>
>> Basically, I don’t see or understand the cases where knowing how many GFNs were
>> successfully mapped, or whether a rollback was performed, would really help — because
>> in most cases, I don’t have a better option than just calling|panic()| at the end.
> panic()-ing is of course only a last resort. Anything related to domain handling
> would better crash only the domain in question. And even that only if suitable
> error handling isn't possible.

And what if there is still no runnable domain available — for example, we are creating a
domain and some p2m mapping is called? Will it be enough to simply not boot this domain?
If yes, then it is enough to return only an error code, without returning how many GFNs were
mapped or rolling back, as the domain won't be run anyway.
(Just to mention, I am not trying to convince you that rollback or returning the number
of mapped GFNs isn't necessary; I am just trying to understand the best implementation of
handling the not-fully-mapped mappings you mentioned.)

>
>> For example, if I call|map_regions_p2mt()| for an MMIO region described in a device
>> tree node, and the mapping fails partway through, I’m left with two options: either
>> ignore the device (if it's not essential for Xen or guest functionality) and continue
>>    booting; in which case I’d need to perform a rollback, and simply knowing the number
>> of successfully mapped GFNs may not be enough or, more likely, just panic.
> Well, no. For example, before even trying to map you could check that the range
> of P2M entries covered is all empty.

Could it be that they aren't all empty? Then it seems like we have an overlap and we can't
just do a mapping, right?
Won't this procedure consume a lot of time, as it is necessary to walk the page tables
for each entry?


>   _Then_ you know how to correctly roll back.
> And yes, doing so may not even require passing back information on how much of
> a region was successfully mapped.

If the P2M entries were empty before the start of the mapping, then it is enough to just go
through the same range (sgfn,nr,smfn) and simply clear them, right?

Thanks.

~ Oleksii

Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Jan Beulich 3 months, 3 weeks ago
On 07.07.2025 18:10, Oleksii Kurochko wrote:
> On 7/7/25 5:15 PM, Jan Beulich wrote:
>> On 07.07.2025 17:00, Oleksii Kurochko wrote:
>>> On 7/7/25 2:53 PM, Jan Beulich wrote:
>>>> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>>>>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>>>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>>> +{
>>>>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>>>>> +
>>>>>>>>> +    return false;
>>>>>>>>> +}
>>>>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>>>>> how is this going to be different from pte_is_valid()?
>>>>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>>>>       /*
>>>>>>>        * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>>>>        * the type to check whether an entry is valid.
>>>>>>>        */
>>>>>>>       static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>>>>       {
>>>>>>>           return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>>>>       }
>>>>>>>
>>>>>>> It is done to track which page was modified by a guest.
>>>>>> But then (again) the name doesn't convey what the function does.
>>>>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
>>>> For P2M type checks please don't invent new naming, but use what both x86
>>>> and Arm are already using. Note how we already have p2m_is_valid() in that
>>>> set. Just that it's not doing what you want here.
>>> Hm, why not doing what I want? p2m_is_valid() verifies if P2M entry is valid.
>>> And in here it is checked if P2M pte is valid from P2M point of view by checking
>>> the type in radix tree and/or in reserved PTEs bits (just to remind we have only 2
>>> free bits for type).
>> Because this is how it's defined on x86:
>>
>> #define p2m_is_valid(_t)    (p2m_to_mask(_t) & \
>>                               (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
>>
>> I.e. more strict that simply "!= p2m_invalid". And I think such predicates
>> would better be uniform across architectures, such that in principle they
>> might also be usable in common code (as we already do with p2m_is_foreign()).
> 
> Yeah, Arm isn't so strict in definition of p2m_is_valid() and it seems like
> x86 and Arm have different understanding what is valid.
> 
> Except what mentioned in the comment that grant types aren't considered valid
> for x86 (and shouldn't be the same then for Arm?), it isn't clear why x86's
> p2m_is_valid() is stricter then Arm's one and if other arches should be also
> so strict.

Arm's p2m_is_valid() is entirely different (and imo misnamed, but arguably one
could also consider x86'es to require a better name). It's a local helper, not
a P2M type checking predicate. With that in mind, you may of course follow
Arm's model, but in the longer run we may need to do something about the name
collision then.

>>> The only use case I can think of is that the caller
>>> might try to map the remaining GFNs again. But that doesn’t seem very useful,
>>> if|p2m_set_entry()| wasn’t able to map the full range, it likely indicates a serious
>>> issue, and retrying would probably result in the same error.
>>>
>>> The same applies to rolling back the state. It wouldn’t be difficult to add a local
>>> array to track all modified PTEs and then use it to revert the state if needed.
>>> But again, what would the caller do after the rollback? At this point, it still seems
>>> like the best option is simply to|panic(). |
>>>
>>> Basically, I don’t see or understand the cases where knowing how many GFNs were
>>> successfully mapped, or whether a rollback was performed, would really help — because
>>> in most cases, I don’t have a better option than just calling|panic()| at the end.
>> panic()-ing is of course only a last resort. Anything related to domain handling
>> would better crash only the domain in question. And even that only if suitable
>> error handling isn't possible.
> 
> And if there is no still any runnable domain available, for example, we are creating
> domain and some p2m mapping is called? Will it be enough just ignore to boot this domain?
> If yes, then it is enough to return only error code without returning how many GFNs were
> mapped or rollbacking as domain won't be ran anyway.

During domain creation all you need to do is return an error. But when you write a
generic function that's also (going to be) used at domain runtime, you need to
consider what to do there in case of partial success.

>>> For example, if I call|map_regions_p2mt()| for an MMIO region described in a device
>>> tree node, and the mapping fails partway through, I’m left with two options: either
>>> ignore the device (if it's not essential for Xen or guest functionality) and continue
>>>    booting; in which case I’d need to perform a rollback, and simply knowing the number
>>> of successfully mapped GFNs may not be enough or, more likely, just panic.
>> Well, no. For example, before even trying to map you could check that the range
>> of P2M entries covered is all empty.
> 
> Could it be that they aren't all empty? Then it seems like we have overlapping and we can't
> just do a mapping, right?

Possibly that would simply mean to return an error, yes.

> Won't be this procedure consume a lot of time as it is needed to go through each page
> tables for each entry.

Well, you're free to suggest a clean alternative without doing so.

>>   _Then_ you know how to correctly roll back.
>> And yes, doing so may not even require passing back information on how much of
>> a region was successfully mapped.
> 
> If P2M entries were empty before start of the mapping then it is enough to just go
> through the same range (sgfn,nr,smfn) and just clean them, right?

Yes, what else would "roll back" mean in that case?

Jan

Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Oleksii Kurochko 3 months, 3 weeks ago
On 7/8/25 9:10 AM, Jan Beulich wrote:
> On 07.07.2025 18:10, Oleksii Kurochko wrote:
>> On 7/7/25 5:15 PM, Jan Beulich wrote:
>>> On 07.07.2025 17:00, Oleksii Kurochko wrote:
>>>> On 7/7/25 2:53 PM, Jan Beulich wrote:
>>>>> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>>>>>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>>>>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>>>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>>>> +{
>>>>>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>>>>>> +
>>>>>>>>>> +    return false;
>>>>>>>>>> +}
>>>>>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>>>>>> how is this going to be different from pte_is_valid()?
>>>>>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>>>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>>>>>        /*
>>>>>>>>         * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>>>>>         * the type to check whether an entry is valid.
>>>>>>>>         */
>>>>>>>>        static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>>>>>        {
>>>>>>>>            return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>>>>>        }
>>>>>>>>
>>>>>>>> It is done to track which page was modified by a guest.
>>>>>>> But then (again) the name doesn't convey what the function does.
>>>>>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
>>>>> For P2M type checks please don't invent new naming, but use what both x86
>>>>> and Arm are already using. Note how we already have p2m_is_valid() in that
>>>>> set. Just that it's not doing what you want here.
>>>> Hm, why not doing what I want? p2m_is_valid() verifies if P2M entry is valid.
>>>> And in here it is checked if P2M pte is valid from P2M point of view by checking
>>>> the type in radix tree and/or in reserved PTEs bits (just to remind we have only 2
>>>> free bits for type).
>>> Because this is how it's defined on x86:
>>>
>>> #define p2m_is_valid(_t)    (p2m_to_mask(_t) & \
>>>                                (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
>>>
>>> I.e. more strict that simply "!= p2m_invalid". And I think such predicates
>>> would better be uniform across architectures, such that in principle they
>>> might also be usable in common code (as we already do with p2m_is_foreign()).
>> Yeah, Arm isn't so strict in definition of p2m_is_valid() and it seems like
>> x86 and Arm have different understanding what is valid.
>>
>> Except what mentioned in the comment that grant types aren't considered valid
>> for x86 (and shouldn't be the same then for Arm?), it isn't clear why x86's
>> p2m_is_valid() is stricter then Arm's one and if other arches should be also
>> so strict.
> Arm's p2m_is_valid() is entirely different (and imo misnamed, but arguably one
> could also consider x86'es to require a better name). It's a local helper, not
> a P2M type checking predicate. With that in mind, you may of course follow
> Arm's model, but in the longer run we may need to do something about the name
> collision then.
>
>>>> The only use case I can think of is that the caller
>>>> might try to map the remaining GFNs again. But that doesn’t seem very useful,
>>>> if|p2m_set_entry()| wasn’t able to map the full range, it likely indicates a serious
>>>> issue, and retrying would probably result in the same error.
>>>>
>>>> The same applies to rolling back the state. It wouldn’t be difficult to add a local
>>>> array to track all modified PTEs and then use it to revert the state if needed.
>>>> But again, what would the caller do after the rollback? At this point, it still seems
>>>> like the best option is simply to|panic(). |
>>>>
>>>> Basically, I don’t see or understand the cases where knowing how many GFNs were
>>>> successfully mapped, or whether a rollback was performed, would really help — because
>>>> in most cases, I don’t have a better option than just calling|panic()| at the end.
>>> panic()-ing is of course only a last resort. Anything related to domain handling
>>> would better crash only the domain in question. And even that only if suitable
>>> error handling isn't possible.
>> And if there is no still any runnable domain available, for example, we are creating
>> domain and some p2m mapping is called? Will it be enough just ignore to boot this domain?
>> If yes, then it is enough to return only error code without returning how many GFNs were
>> mapped or rollbacking as domain won't be ran anyway.
> During domain creation all you need to do is return an error. But when you write a
> generic function that's also (going to be) used at domain runtime, you need to
> consider what to do there in case of partial success.
>
>>>> For example, if I call|map_regions_p2mt()| for an MMIO region described in a device
>>>> tree node, and the mapping fails partway through, I’m left with two options: either
>>>> ignore the device (if it's not essential for Xen or guest functionality) and continue
>>>>     booting; in which case I’d need to perform a rollback, and simply knowing the number
>>>> of successfully mapped GFNs may not be enough or, more likely, just panic.
>>> Well, no. For example, before even trying to map you could check that the range
>>> of P2M entries covered is all empty.
>> Could it be that they aren't all empty? Then it seems like we have overlapping and we can't
>> just do a mapping, right?
> Possibly that would simply mean to return an error, yes.
>
>> Won't be this procedure consume a lot of time as it is needed to go through each page
>> tables for each entry.
> Well, you're free to suggest a clean alternative without doing so.

I thought about dynamically allocating an array in p2m_set_entry() in which to save all changed
PTEs, and then using it to roll back if __p2m_set_entry() returns rc != 0 ...

>
>>>    _Then_ you know how to correctly roll back.
>>> And yes, doing so may not even require passing back information on how much of
>>> a region was successfully mapped.
>> If P2M entries were empty before start of the mapping then it is enough to just go
>> through the same range (sgfn,nr,smfn) and just clean them, right?
> Yes, what else would "roll back" mean in that case?

... If we know that the P2M entries were empty, then there's nothing else to be done; simply
clearing the PTEs is sufficient.
However, if the P2M entries weren’t empty (and I’m still not sure whether that’s a legal
case), then rolling back would mean restoring their original state, the state they
had before the P2M mapping procedure started.

~ Oleksii
Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Oleksii Kurochko 3 months, 3 weeks ago
On 7/8/25 11:01 AM, Oleksii Kurochko wrote:
>
>
> On 7/8/25 9:10 AM, Jan Beulich wrote:
>> On 07.07.2025 18:10, Oleksii Kurochko wrote:
>>> On 7/7/25 5:15 PM, Jan Beulich wrote:
>>>> On 07.07.2025 17:00, Oleksii Kurochko wrote:
>>>>> On 7/7/25 2:53 PM, Jan Beulich wrote:
>>>>>> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>>>>>>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>>>>>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>>>>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>>>>> +{
>>>>>>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>>>>>>> +
>>>>>>>>>>> +    return false;
>>>>>>>>>>> +}
>>>>>>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>>>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>>>>>>> how is this going to be different from pte_is_valid()?
>>>>>>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>>>>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>>>>>>        /*
>>>>>>>>>         * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>>>>>>         * the type to check whether an entry is valid.
>>>>>>>>>         */
>>>>>>>>>        static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>>>>>>        {
>>>>>>>>>            return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>>>>>>        }
>>>>>>>>>
>>>>>>>>> It is done to track which page was modified by a guest.
>>>>>>>> But then (again) the name doesn't convey what the function does.
>>>>>>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
>>>>>> For P2M type checks please don't invent new naming, but use what both x86
>>>>>> and Arm are already using. Note how we already have p2m_is_valid() in that
>>>>>> set. Just that it's not doing what you want here.
>>>>> Hm, why not doing what I want? p2m_is_valid() verifies if P2M entry is valid.
>>>>> And in here it is checked if P2M pte is valid from P2M point of view by checking
>>>>> the type in radix tree and/or in reserved PTEs bits (just to remind we have only 2
>>>>> free bits for type).
>>>> Because this is how it's defined on x86:
>>>>
>>>> #define p2m_is_valid(_t)    (p2m_to_mask(_t) & \
>>>>                                (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
>>>>
>>>> I.e. more strict that simply "!= p2m_invalid". And I think such predicates
>>>> would better be uniform across architectures, such that in principle they
>>>> might also be usable in common code (as we already do with p2m_is_foreign()).
>>> Yeah, Arm isn't so strict in definition of p2m_is_valid() and it seems like
>>> x86 and Arm have different understanding what is valid.
>>>
>>> Except what mentioned in the comment that grant types aren't considered valid
>>> for x86 (and shouldn't be the same then for Arm?), it isn't clear why x86's
>>> p2m_is_valid() is stricter then Arm's one and if other arches should be also
>>> so strict.
>> Arm's p2m_is_valid() is entirely different (and imo misnamed, but arguably one
>> could also consider x86'es to require a better name). It's a local helper, not
>> a P2M type checking predicate. With that in mind, you may of course follow
>> Arm's model, but in the longer run we may need to do something about the name
>> collision then.
>>
>>>>> The only use case I can think of is that the caller
>>>>> might try to map the remaining GFNs again. But that doesn’t seem very useful,
>>>>> if|p2m_set_entry()| wasn’t able to map the full range, it likely indicates a serious
>>>>> issue, and retrying would probably result in the same error.
>>>>>
>>>>> The same applies to rolling back the state. It wouldn’t be difficult to add a local
>>>>> array to track all modified PTEs and then use it to revert the state if needed.
>>>>> But again, what would the caller do after the rollback? At this point, it still seems
>>>>> like the best option is simply to|panic(). |
>>>>>
>>>>> Basically, I don’t see or understand the cases where knowing how many GFNs were
>>>>> successfully mapped, or whether a rollback was performed, would really help — because
>>>>> in most cases, I don’t have a better option than just calling|panic()| at the end.
>>>> panic()-ing is of course only a last resort. Anything related to domain handling
>>>> would better crash only the domain in question. And even that only if suitable
>>>> error handling isn't possible.
>>> And if there is no still any runnable domain available, for example, we are creating
>>> domain and some p2m mapping is called? Will it be enough just ignore to boot this domain?
>>> If yes, then it is enough to return only error code without returning how many GFNs were
>>> mapped or rollbacking as domain won't be ran anyway.
>> During domain creation all you need to do is return an error. But when you write a
>> generic function that's also (going to be) used at domain runtime, you need to
>> consider what to do there in case of partial success.
>>
>>>>> For example, if I call|map_regions_p2mt()| for an MMIO region described in a device
>>>>> tree node, and the mapping fails partway through, I’m left with two options: either
>>>>> ignore the device (if it's not essential for Xen or guest functionality) and continue
>>>>>     booting; in which case I’d need to perform a rollback, and simply knowing the number
>>>>> of successfully mapped GFNs may not be enough or, more likely, just panic.
>>>> Well, no. For example, before even trying to map you could check that the range
>>>> of P2M entries covered is all empty.
>>> Could it be that they aren't all empty? Then it seems like we have overlapping and we can't
>>> just do a mapping, right?
>> Possibly that would simply mean to return an error, yes.
>>
>>> Won't be this procedure consume a lot of time as it is needed to go through each page
>>> tables for each entry.
>> Well, you're free to suggest a clean alternative without doing so.
> I thought about dynamically allocating an array in p2m_set_entry(), where to save all changed PTEs,
> and then use it to roll back if __p2m_set_entry() returns rc != 0 ...
>
>>>>    _Then_ you know how to correctly roll back.
>>>> And yes, doing so may not even require passing back information on how much of
>>>> a region was successfully mapped.
>>> If P2M entries were empty before start of the mapping then it is enough to just go
>>> through the same range (sgfn,nr,smfn) and just clean them, right?
>> Yes, what else would "roll back" mean in that case?
> ... If we know that the P2M entries were empty, then there's nothing else to be done, just
> clean PTE is needed to be done.
> However, if the P2M entries weren’t empty (and I’m still not sure whether that’s a legal
> case), then rolling back would mean restoring their original state, the state they
> had before the P2M mapping procedure started.

A possible roll back is harder to implement than expected, because there is a case where a
subtree could be freed:
     /*
      * Free the entry only if the original pte was valid and the base
      * is different (to avoid freeing when permission is changed).
      */
     if ( p2me_is_valid(p2m, orig_pte) &&
          !mfn_eq(pte_get_mfn(*entry), pte_get_mfn(orig_pte)) )
         p2m_free_subtree(p2m, orig_pte, level);
In this case it would then be necessary to store the full subtree.

~ Oleksii
Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Jan Beulich 3 months, 3 weeks ago
On 08.07.2025 12:37, Oleksii Kurochko wrote:
> 
> On 7/8/25 11:01 AM, Oleksii Kurochko wrote:
>>
>>
>> On 7/8/25 9:10 AM, Jan Beulich wrote:
>>> On 07.07.2025 18:10, Oleksii Kurochko wrote:
>>>> On 7/7/25 5:15 PM, Jan Beulich wrote:
>>>>> On 07.07.2025 17:00, Oleksii Kurochko wrote:
>>>>>> On 7/7/25 2:53 PM, Jan Beulich wrote:
>>>>>>> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>>>>>>>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>>>>>>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>>>>>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>>>>>> +{
>>>>>>>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>>>>>>>> +
>>>>>>>>>>>> +    return false;
>>>>>>>>>>>> +}
>>>>>>>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>>>>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>>>>>>>> how is this going to be different from pte_is_valid()?
>>>>>>>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>>>>>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>>>>>>>        /*
>>>>>>>>>>         * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>>>>>>>         * the type to check whether an entry is valid.
>>>>>>>>>>         */
>>>>>>>>>>        static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>>>>>>>        {
>>>>>>>>>>            return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>>>>>>>        }
>>>>>>>>>>
>>>>>>>>>> It is done to track which page was modified by a guest.
>>>>>>>>> But then (again) the name doesn't convey what the function does.
>>>>>>>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
>>>>>>> For P2M type checks please don't invent new naming, but use what both x86
>>>>>>> and Arm are already using. Note how we already have p2m_is_valid() in that
>>>>>>> set. Just that it's not doing what you want here.
>>>>>> Hm, why not doing what I want? p2m_is_valid() verifies if P2M entry is valid.
>>>>>> And in here it is checked if P2M pte is valid from P2M point of view by checking
>>>>>> the type in radix tree and/or in reserved PTEs bits (just to remind we have only 2
>>>>>> free bits for type).
>>>>> Because this is how it's defined on x86:
>>>>>
>>>>> #define p2m_is_valid(_t)    (p2m_to_mask(_t) & \
>>>>>                                (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
>>>>>
>>>>> I.e. more strict that simply "!= p2m_invalid". And I think such predicates
>>>>> would better be uniform across architectures, such that in principle they
>>>>> might also be usable in common code (as we already do with p2m_is_foreign()).
>>>> Yeah, Arm isn't so strict in definition of p2m_is_valid() and it seems like
>>>> x86 and Arm have different understanding what is valid.
>>>>
>>>> Except what mentioned in the comment that grant types aren't considered valid
>>>> for x86 (and shouldn't be the same then for Arm?), it isn't clear why x86's
>>>> p2m_is_valid() is stricter then Arm's one and if other arches should be also
>>>> so strict.
>>> Arm's p2m_is_valid() is entirely different (and imo misnamed, but arguably one
>>> could also consider x86'es to require a better name). It's a local helper, not
>>> a P2M type checking predicate. With that in mind, you may of course follow
>>> Arm's model, but in the longer run we may need to do something about the name
>>> collision then.
>>>
>>>>>> The only use case I can think of is that the caller
>>>>>> might try to map the remaining GFNs again. But that doesn’t seem very useful,
>>>>>> if|p2m_set_entry()| wasn’t able to map the full range, it likely indicates a serious
>>>>>> issue, and retrying would probably result in the same error.
>>>>>>
>>>>>> The same applies to rolling back the state. It wouldn’t be difficult to add a local
>>>>>> array to track all modified PTEs and then use it to revert the state if needed.
>>>>>> But again, what would the caller do after the rollback? At this point, it still seems
>>>>>> like the best option is simply to|panic(). |
>>>>>>
>>>>>> Basically, I don’t see or understand the cases where knowing how many GFNs were
>>>>>> successfully mapped, or whether a rollback was performed, would really help — because
>>>>>> in most cases, I don’t have a better option than just calling|panic()| at the end.
>>>>> panic()-ing is of course only a last resort. Anything related to domain handling
>>>>> would better crash only the domain in question. And even that only if suitable
>>>>> error handling isn't possible.
>>>> And if there is no still any runnable domain available, for example, we are creating
>>>> domain and some p2m mapping is called? Will it be enough just ignore to boot this domain?
>>>> If yes, then it is enough to return only error code without returning how many GFNs were
>>>> mapped or rollbacking as domain won't be ran anyway.
>>> During domain creation all you need to do is return an error. But when you write a
>>> generic function that's also (going to be) used at domain runtime, you need to
>>> consider what to do there in case of partial success.
>>>
>>>>>> For example, if I call|map_regions_p2mt()| for an MMIO region described in a device
>>>>>> tree node, and the mapping fails partway through, I’m left with two options: either
>>>>>> ignore the device (if it's not essential for Xen or guest functionality) and continue
>>>>>>     booting; in which case I’d need to perform a rollback, and simply knowing the number
>>>>>> of successfully mapped GFNs may not be enough or, more likely, just panic.
>>>>> Well, no. For example, before even trying to map you could check that the range
>>>>> of P2M entries covered is all empty.
>>>> Could it be that they aren't all empty? Then it seems like we have overlapping and we can't
>>>> just do a mapping, right?
>>> Possibly that would simply mean to return an error, yes.
>>>
>>>> Won't be this procedure consume a lot of time as it is needed to go through each page
>>>> tables for each entry.
>>> Well, you're free to suggest a clean alternative without doing so.
>> I thought about dynamically allocating an array in p2m_set_entry(), where to save all changed PTEs,
>> and then use it to roll back if __p2m_set_entry() returns rc != 0 ...

That's another possible source for failure, and such an allocation may end
up being a rather big one.

>>>>>    _Then_ you know how to correctly roll back.
>>>>> And yes, doing so may not even require passing back information on how much of
>>>>> a region was successfully mapped.
>>>> If P2M entries were empty before start of the mapping then it is enough to just go
>>>> through the same range (sgfn,nr,smfn) and just clean them, right?
>>> Yes, what else would "roll back" mean in that case?
>> ... If we know that the P2M entries were empty, then there's nothing else to be done, just
>> clean PTE is needed to be done.
>> However, if the P2M entries weren’t empty (and I’m still not sure whether that’s a legal
>> case), then rolling back would mean restoring their original state, the state they
>> had before the P2M mapping procedure started.
> 
> Possible roll back is harder to implement as expected because there is a case where subtree
> could be freed:
>      /*
>       * Free the entry only if the original pte was valid and the base
>       * is different (to avoid freeing when permission is changed).
>       */
>      if ( p2me_is_valid(p2m, orig_pte) &&
>           !mfn_eq(pte_get_mfn(*entry), pte_get_mfn(orig_pte)) )
>          p2m_free_subtree(p2m, orig_pte, level);
> In this case then it will be needed to store the full subtree.

Right, which is why it may be desirable to limit the ability to update multiple
entries in one go. Or work from certain assumptions, violation of which would
cause the domain to be crashed.

Jan

Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Oleksii Kurochko 3 months, 3 weeks ago
On 7/8/25 2:45 PM, Jan Beulich wrote:
> On 08.07.2025 12:37, Oleksii Kurochko wrote:
>> On 7/8/25 11:01 AM, Oleksii Kurochko wrote:
>>>
>>> On 7/8/25 9:10 AM, Jan Beulich wrote:
>>>> On 07.07.2025 18:10, Oleksii Kurochko wrote:
>>>>> On 7/7/25 5:15 PM, Jan Beulich wrote:
>>>>>> On 07.07.2025 17:00, Oleksii Kurochko wrote:
>>>>>>> On 7/7/25 2:53 PM, Jan Beulich wrote:
>>>>>>>> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>>>>>>>>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>>>>>>>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>>>>>>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>>>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>>>>>>> +{
>>>>>>>>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>>>>>>>>> +
>>>>>>>>>>>>> +    return false;
>>>>>>>>>>>>> +}
>>>>>>>>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>>>>>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>>>>>>>>> how is this going to be different from pte_is_valid()?
>>>>>>>>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>>>>>>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>>>>>>>>         /*
>>>>>>>>>>>          * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>>>>>>>>          * the type to check whether an entry is valid.
>>>>>>>>>>>          */
>>>>>>>>>>>         static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>>>>>>>>         {
>>>>>>>>>>>             return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>>>>>>>>         }
>>>>>>>>>>>
>>>>>>>>>>> It is done to track which page was modified by a guest.
>>>>>>>>>> But then (again) the name doesn't convey what the function does.
>>>>>>>>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
>>>>>>>> For P2M type checks please don't invent new naming, but use what both x86
>>>>>>>> and Arm are already using. Note how we already have p2m_is_valid() in that
>>>>>>>> set. Just that it's not doing what you want here.
>>>>>>> Hm, why not doing what I want? p2m_is_valid() verifies if P2M entry is valid.
>>>>>>> And in here it is checked if P2M pte is valid from P2M point of view by checking
>>>>>>> the type in radix tree and/or in reserved PTEs bits (just to remind we have only 2
>>>>>>> free bits for type).
>>>>>> Because this is how it's defined on x86:
>>>>>>
>>>>>> #define p2m_is_valid(_t)    (p2m_to_mask(_t) & \
>>>>>>                                 (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
>>>>>>
>>>>>> I.e. more strict that simply "!= p2m_invalid". And I think such predicates
>>>>>> would better be uniform across architectures, such that in principle they
>>>>>> might also be usable in common code (as we already do with p2m_is_foreign()).
>>>>> Yeah, Arm isn't so strict in definition of p2m_is_valid() and it seems like
>>>>> x86 and Arm have different understanding what is valid.
>>>>>
>>>>> Except what mentioned in the comment that grant types aren't considered valid
>>>>> for x86 (and shouldn't be the same then for Arm?), it isn't clear why x86's
>>>>> p2m_is_valid() is stricter then Arm's one and if other arches should be also
>>>>> so strict.
>>>> Arm's p2m_is_valid() is entirely different (and imo misnamed, but arguably one
>>>> could also consider x86'es to require a better name). It's a local helper, not
>>>> a P2M type checking predicate. With that in mind, you may of course follow
>>>> Arm's model, but in the longer run we may need to do something about the name
>>>> collision then.
>>>>
>>>>>>> The only use case I can think of is that the caller
>>>>>>> might try to map the remaining GFNs again. But that doesn’t seem very useful,
>>>>>>> if|p2m_set_entry()| wasn’t able to map the full range, it likely indicates a serious
>>>>>>> issue, and retrying would probably result in the same error.
>>>>>>>
>>>>>>> The same applies to rolling back the state. It wouldn’t be difficult to add a local
>>>>>>> array to track all modified PTEs and then use it to revert the state if needed.
>>>>>>> But again, what would the caller do after the rollback? At this point, it still seems
>>>>>>> like the best option is simply to|panic(). |
>>>>>>>
>>>>>>> Basically, I don’t see or understand the cases where knowing how many GFNs were
>>>>>>> successfully mapped, or whether a rollback was performed, would really help — because
>>>>>>> in most cases, I don’t have a better option than just calling|panic()| at the end.
>>>>>> panic()-ing is of course only a last resort. Anything related to domain handling
>>>>>> would better crash only the domain in question. And even that only if suitable
>>>>>> error handling isn't possible.
>>>>> And if there is no still any runnable domain available, for example, we are creating
>>>>> domain and some p2m mapping is called? Will it be enough just ignore to boot this domain?
>>>>> If yes, then it is enough to return only error code without returning how many GFNs were
>>>>> mapped or rollbacking as domain won't be ran anyway.
>>>> During domain creation all you need to do is return an error. But when you write a
>>>> generic function that's also (going to be) used at domain runtime, you need to
>>>> consider what to do there in case of partial success.
>>>>
>>>>>>> For example, if I call|map_regions_p2mt()| for an MMIO region described in a device
>>>>>>> tree node, and the mapping fails partway through, I’m left with two options: either
>>>>>>> ignore the device (if it's not essential for Xen or guest functionality) and continue
>>>>>>>      booting; in which case I’d need to perform a rollback, and simply knowing the number
>>>>>>> of successfully mapped GFNs may not be enough or, more likely, just panic.
>>>>>> Well, no. For example, before even trying to map you could check that the range
>>>>>> of P2M entries covered is all empty.
>>>>> Could it be that they aren't all empty? Then it seems like we have overlapping and we can't
>>>>> just do a mapping, right?
>>>> Possibly that would simply mean to return an error, yes.
>>>>
>>>>> Won't be this procedure consume a lot of time as it is needed to go through each page
>>>>> tables for each entry.
>>>> Well, you're free to suggest a clean alternative without doing so.
>>> I thought about dynamically allocating an array in p2m_set_entry(), where to save all changed PTEs,
>>> and then use it to roll back if __p2m_set_entry() returns rc != 0 ...
> That's another possible source for failure, and such an allocation may end
> up being a rather big one.
>
>>>>>>     _Then_ you know how to correctly roll back.
>>>>>> And yes, doing so may not even require passing back information on how much of
>>>>>> a region was successfully mapped.
>>>>> If P2M entries were empty before start of the mapping then it is enough to just go
>>>>> through the same range (sgfn,nr,smfn) and just clean them, right?
>>>> Yes, what else would "roll back" mean in that case?
>>> ... If we know that the P2M entries were empty, then there's nothing else to be done, just
>>> clean PTE is needed to be done.
>>> However, if the P2M entries weren’t empty (and I’m still not sure whether that’s a legal
>>> case), then rolling back would mean restoring their original state, the state they
>>> had before the P2M mapping procedure started.
>> Possible roll back is harder to implement as expected because there is a case where subtree
>> could be freed:
>>       /*
>>        * Free the entry only if the original pte was valid and the base
>>        * is different (to avoid freeing when permission is changed).
>>        */
>>       if ( p2me_is_valid(p2m, orig_pte) &&
>>            !mfn_eq(pte_get_mfn(*entry), pte_get_mfn(orig_pte)) )
>>           p2m_free_subtree(p2m, orig_pte, level);
>> In this case then it will be needed to store the full subtree.
> Right, which is why it may be desirable to limit the ability to update multiple
> entries in one go. Or work from certain assumptions, violation of which would
> cause the domain to be crashed.

It seems to me that the main issue with updating multiple entries in one go is the rollback
mechanism in case of a partial mapping failure. (Are there other issues? For example, the
mapping could consume a lot of time, so something would have to wait until the allocation
finishes?) In my opinion, the rollback mechanism is quite complex to implement and could
become a source of further failures.
For example, most of the cases where p2m_set_entry() could fail are due to failure in
mapping the page table (to allow Xen to walk through it) or failure in creating a new page
table due to memory exhaustion. Then, during rollback, which might also require memory
allocation, we could face the same memory shortage issue.
And what should be done in that case?

In my opinion, the best option is to simply return from p2m_set_entry() the number of
successfully mapped GFNs (stored in rc which is returned by p2m_set_entry()) and let
the caller decide how to handle the partial mapping:
1. If a partial mapping occurs during domain creation, we could just report that this
    domain can't be created and continue without it if there are other domains to start;
    otherwise, panic.
2. If a partial mapping occurs during the lifetime of a domain, for example, if the domain
    requests to map some memory, we return the number of successfully mapped GFNs and let the
    domain decide what to do: either remove the mappings or retry mapping the remaining part.
    However, I think there's not much value in retrying, since p2m_set_entry() is likely to
    fail again. So, perhaps the best course of action is to stop the domain altogether.
Does that make sense?

~ Oleksii
Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Jan Beulich 3 months, 3 weeks ago
On 08.07.2025 17:42, Oleksii Kurochko wrote:
> 
> On 7/8/25 2:45 PM, Jan Beulich wrote:
>> On 08.07.2025 12:37, Oleksii Kurochko wrote:
>>> On 7/8/25 11:01 AM, Oleksii Kurochko wrote:
>>>>
>>>> On 7/8/25 9:10 AM, Jan Beulich wrote:
>>>>> On 07.07.2025 18:10, Oleksii Kurochko wrote:
>>>>>> On 7/7/25 5:15 PM, Jan Beulich wrote:
>>>>>>> On 07.07.2025 17:00, Oleksii Kurochko wrote:
>>>>>>>> On 7/7/25 2:53 PM, Jan Beulich wrote:
>>>>>>>>> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>>>>>>>>>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>>>>>>>>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>>>>>>>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>>>>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>>>>>>>>>> +
>>>>>>>>>>>>>> +    return false;
>>>>>>>>>>>>>> +}
>>>>>>>>>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>>>>>>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>>>>>>>>>> how is this going to be different from pte_is_valid()?
>>>>>>>>>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>>>>>>>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>>>>>>>>>         /*
>>>>>>>>>>>>          * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>>>>>>>>>          * the type to check whether an entry is valid.
>>>>>>>>>>>>          */
>>>>>>>>>>>>         static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>>>>>>>>>         {
>>>>>>>>>>>>             return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>>>>>>>>>         }
>>>>>>>>>>>>
>>>>>>>>>>>> It is done to track which page was modified by a guest.
>>>>>>>>>>> But then (again) the name doesn't convey what the function does.
>>>>>>>>>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
>>>>>>>>> For P2M type checks please don't invent new naming, but use what both x86
>>>>>>>>> and Arm are already using. Note how we already have p2m_is_valid() in that
>>>>>>>>> set. Just that it's not doing what you want here.
>>>>>>>> Hm, why not doing what I want? p2m_is_valid() verifies if P2M entry is valid.
>>>>>>>> And in here it is checked if P2M pte is valid from P2M point of view by checking
>>>>>>>> the type in radix tree and/or in reserved PTEs bits (just to remind we have only 2
>>>>>>>> free bits for type).
>>>>>>> Because this is how it's defined on x86:
>>>>>>>
>>>>>>> #define p2m_is_valid(_t)    (p2m_to_mask(_t) & \
>>>>>>>                                 (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
>>>>>>>
>>>>>>> I.e. more strict that simply "!= p2m_invalid". And I think such predicates
>>>>>>> would better be uniform across architectures, such that in principle they
>>>>>>> might also be usable in common code (as we already do with p2m_is_foreign()).
>>>>>> Yeah, Arm isn't so strict in definition of p2m_is_valid() and it seems like
>>>>>> x86 and Arm have different understanding what is valid.
>>>>>>
>>>>>> Except what mentioned in the comment that grant types aren't considered valid
>>>>>> for x86 (and shouldn't be the same then for Arm?), it isn't clear why x86's
>>>>>> p2m_is_valid() is stricter then Arm's one and if other arches should be also
>>>>>> so strict.
>>>>> Arm's p2m_is_valid() is entirely different (and imo misnamed, but arguably one
>>>>> could also consider x86'es to require a better name). It's a local helper, not
>>>>> a P2M type checking predicate. With that in mind, you may of course follow
>>>>> Arm's model, but in the longer run we may need to do something about the name
>>>>> collision then.
>>>>>
>>>>>>>> The only use case I can think of is that the caller
>>>>>>>> might try to map the remaining GFNs again. But that doesn’t seem very useful,
>>>>>>>> if|p2m_set_entry()| wasn’t able to map the full range, it likely indicates a serious
>>>>>>>> issue, and retrying would probably result in the same error.
>>>>>>>>
>>>>>>>> The same applies to rolling back the state. It wouldn’t be difficult to add a local
>>>>>>>> array to track all modified PTEs and then use it to revert the state if needed.
>>>>>>>> But again, what would the caller do after the rollback? At this point, it still seems
>>>>>>>> like the best option is simply to|panic(). |
>>>>>>>>
>>>>>>>> Basically, I don’t see or understand the cases where knowing how many GFNs were
>>>>>>>> successfully mapped, or whether a rollback was performed, would really help — because
>>>>>>>> in most cases, I don’t have a better option than just calling|panic()| at the end.
>>>>>>> panic()-ing is of course only a last resort. Anything related to domain handling
>>>>>>> would better crash only the domain in question. And even that only if suitable
>>>>>>> error handling isn't possible.
>>>>>> And if there is no still any runnable domain available, for example, we are creating
>>>>>> domain and some p2m mapping is called? Will it be enough just ignore to boot this domain?
>>>>>> If yes, then it is enough to return only error code without returning how many GFNs were
>>>>>> mapped or rollbacking as domain won't be ran anyway.
>>>>> During domain creation all you need to do is return an error. But when you write a
>>>>> generic function that's also (going to be) used at domain runtime, you need to
>>>>> consider what to do there in case of partial success.
>>>>>
>>>>>>>> For example, if I call|map_regions_p2mt()| for an MMIO region described in a device
>>>>>>>> tree node, and the mapping fails partway through, I’m left with two options: either
>>>>>>>> ignore the device (if it's not essential for Xen or guest functionality) and continue
>>>>>>>>      booting; in which case I’d need to perform a rollback, and simply knowing the number
>>>>>>>> of successfully mapped GFNs may not be enough or, more likely, just panic.
>>>>>>> Well, no. For example, before even trying to map you could check that the range
>>>>>>> of P2M entries covered is all empty.
>>>>>> Could it be that they aren't all empty? Then it seems like we have overlapping and we can't
>>>>>> just do a mapping, right?
>>>>> Possibly that would simply mean to return an error, yes.
>>>>>
>>>>>> Won't be this procedure consume a lot of time as it is needed to go through each page
>>>>>> tables for each entry.
>>>>> Well, you're free to suggest a clean alternative without doing so.
>>>> I thought about dynamically allocating an array in p2m_set_entry(), where to save all changed PTEs,
>>>> and then use it to roll back if __p2m_set_entry() returns rc != 0 ...
>> That's another possible source for failure, and such an allocation may end
>> up being a rather big one.
>>
>>>>>>>     _Then_ you know how to correctly roll back.
>>>>>>> And yes, doing so may not even require passing back information on how much of
>>>>>>> a region was successfully mapped.
>>>>>> If P2M entries were empty before start of the mapping then it is enough to just go
>>>>>> through the same range (sgfn,nr,smfn) and just clean them, right?
>>>>> Yes, what else would "roll back" mean in that case?
>>>> ... If we know that the P2M entries were empty, then there's nothing else to be done, just
>>>> clean PTE is needed to be done.
>>>> However, if the P2M entries weren’t empty (and I’m still not sure whether that’s a legal
>>>> case), then rolling back would mean restoring their original state, the state they
>>>> had before the P2M mapping procedure started.
>>> Possible roll back is harder to implement as expected because there is a case where subtree
>>> could be freed:
>>>       /*
>>>        * Free the entry only if the original pte was valid and the base
>>>        * is different (to avoid freeing when permission is changed).
>>>        */
>>>       if ( p2me_is_valid(p2m, orig_pte) &&
>>>            !mfn_eq(pte_get_mfn(*entry), pte_get_mfn(orig_pte)) )
>>>           p2m_free_subtree(p2m, orig_pte, level);
>>> In this case then it will be needed to store the full subtree.
>> Right, which is why it may be desirable to limit the ability to update multiple
>> entries in one go. Or work from certain assumptions, violation of which would
>> cause the domain to be crashed.
> 
> It seems to me that the main issue with updating multiple entries in one go is the rollback
> mechanism in case of a partial mapping failure. (other issues? mapping could consume a lot
> of time so something should wait while allocation will end?) In my opinion, the rollback
> mechanism is quite complex to implement and could become a source of further failures.
> For example, most of the cases where p2m_set_entry() could fail are due to failure in
> mapping the page table (to allow Xen to walk through it) or failure in creating a new page
> table due to memory exhaustion. Then, during rollback, which might also require memory
> allocation, we could face the same memory shortage issue.
> And what should be done in that case?
> 
> In my opinion, the best option is to simply return from p2m_set_entry() the number of
> successfully mapped GFNs (stored in rc which is returned by p2m_set_entry()) and let
> the caller decide how to handle the partial mapping:
> 1. If a partial mapping occurs during domain creation, we could just report that this
>     domain can't be created and continue without it if there are other domains to start;
>     otherwise, panic.

I don't see how panic()-ing is relevant here. That's to be decided (far) up
the call stack.

> 2. If a partial mapping occurs during the lifetime of a domain, for example, if the domain
>     requests to map some memory, we return the number of successfully mapped GFNs and let the
>     domain decide what to do: either remove the mappings or retry mapping the remaining part.
>     However, I think there's not much value in retrying, since p2m_set_entry() is likely to
>     fail again. So, perhaps the best course of action is to stop the domain altogether.
> Does that make sense?

Sure, why not. Provided you actually have a way to communicate back how much
was mapped.

Jan

Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Oleksii Kurochko 3 months, 3 weeks ago
On 7/8/25 6:04 PM, Jan Beulich wrote:
> On 08.07.2025 17:42, Oleksii Kurochko wrote:
>> On 7/8/25 2:45 PM, Jan Beulich wrote:
>>> On 08.07.2025 12:37, Oleksii Kurochko wrote:
>>>> On 7/8/25 11:01 AM, Oleksii Kurochko wrote:
>>>>> On 7/8/25 9:10 AM, Jan Beulich wrote:
>>>>>> On 07.07.2025 18:10, Oleksii Kurochko wrote:
>>>>>>> On 7/7/25 5:15 PM, Jan Beulich wrote:
>>>>>>>> On 07.07.2025 17:00, Oleksii Kurochko wrote:
>>>>>>>>> On 7/7/25 2:53 PM, Jan Beulich wrote:
>>>>>>>>>> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>>>>>>>>>>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>>>>>>>>>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>>>>>>>>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>>>>>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>> +    return false;
>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>>>>>>>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>>>>>>>>>>> how is this going to be different from pte_is_valid()?
>>>>>>>>>>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>>>>>>>>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>>>>>>>>>>          /*
>>>>>>>>>>>>>           * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>>>>>>>>>>           * the type to check whether an entry is valid.
>>>>>>>>>>>>>           */
>>>>>>>>>>>>>          static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>>>>>>>>>>          {
>>>>>>>>>>>>>              return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>>>>>>>>>>          }
>>>>>>>>>>>>>
>>>>>>>>>>>>> It is done to track which page was modified by a guest.
>>>>>>>>>>>> But then (again) the name doesn't convey what the function does.
>>>>>>>>>>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
>>>>>>>>>> For P2M type checks please don't invent new naming, but use what both x86
>>>>>>>>>> and Arm are already using. Note how we already have p2m_is_valid() in that
>>>>>>>>>> set. Just that it's not doing what you want here.
>>>>>>>>> Hm, why not doing what I want? p2m_is_valid() verifies if P2M entry is valid.
>>>>>>>>> And in here it is checked if P2M pte is valid from P2M point of view by checking
>>>>>>>>> the type in radix tree and/or in reserved PTEs bits (just to remind we have only 2
>>>>>>>>> free bits for type).
>>>>>>>> Because this is how it's defined on x86:
>>>>>>>>
>>>>>>>> #define p2m_is_valid(_t)    (p2m_to_mask(_t) & \
>>>>>>>>                                  (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
>>>>>>>>
>>>>>>>> I.e. more strict that simply "!= p2m_invalid". And I think such predicates
>>>>>>>> would better be uniform across architectures, such that in principle they
>>>>>>>> might also be usable in common code (as we already do with p2m_is_foreign()).
>>>>>>> Yeah, Arm isn't so strict in definition of p2m_is_valid() and it seems like
>>>>>>> x86 and Arm have different understanding what is valid.
>>>>>>>
>>>>>>> Except what mentioned in the comment that grant types aren't considered valid
>>>>>>> for x86 (and shouldn't be the same then for Arm?), it isn't clear why x86's
>>>>>>> p2m_is_valid() is stricter then Arm's one and if other arches should be also
>>>>>>> so strict.
>>>>>> Arm's p2m_is_valid() is entirely different (and imo misnamed, but arguably one
>>>>>> could also consider x86'es to require a better name). It's a local helper, not
>>>>>> a P2M type checking predicate. With that in mind, you may of course follow
>>>>>> Arm's model, but in the longer run we may need to do something about the name
>>>>>> collision then.
>>>>>>
>>>>>>>>> The only use case I can think of is that the caller
>>>>>>>>> might try to map the remaining GFNs again. But that doesn’t seem very useful,
>>>>>>>>> if|p2m_set_entry()| wasn’t able to map the full range, it likely indicates a serious
>>>>>>>>> issue, and retrying would probably result in the same error.
>>>>>>>>>
>>>>>>>>> The same applies to rolling back the state. It wouldn’t be difficult to add a local
>>>>>>>>> array to track all modified PTEs and then use it to revert the state if needed.
>>>>>>>>> But again, what would the caller do after the rollback? At this point, it still seems
>>>>>>>>> like the best option is simply to|panic(). |
>>>>>>>>>
>>>>>>>>> Basically, I don’t see or understand the cases where knowing how many GFNs were
>>>>>>>>> successfully mapped, or whether a rollback was performed, would really help — because
>>>>>>>>> in most cases, I don’t have a better option than just calling|panic()| at the end.
>>>>>>>> panic()-ing is of course only a last resort. Anything related to domain handling
>>>>>>>> would better crash only the domain in question. And even that only if suitable
>>>>>>>> error handling isn't possible.
>>>>>>> And if there is no still any runnable domain available, for example, we are creating
>>>>>>> domain and some p2m mapping is called? Will it be enough just ignore to boot this domain?
>>>>>>> If yes, then it is enough to return only error code without returning how many GFNs were
>>>>>>> mapped or rollbacking as domain won't be ran anyway.
>>>>>> During domain creation all you need to do is return an error. But when you write a
>>>>>> generic function that's also (going to be) used at domain runtime, you need to
>>>>>> consider what to do there in case of partial success.
>>>>>>
>>>>>>>>> For example, if I call|map_regions_p2mt()| for an MMIO region described in a device
>>>>>>>>> tree node, and the mapping fails partway through, I’m left with two options: either
>>>>>>>>> ignore the device (if it's not essential for Xen or guest functionality) and continue
>>>>>>>>>       booting; in which case I’d need to perform a rollback, and simply knowing the number
>>>>>>>>> of successfully mapped GFNs may not be enough or, more likely, just panic.
>>>>>>>> Well, no. For example, before even trying to map you could check that the range
>>>>>>>> of P2M entries covered is all empty.
>>>>>>> Could it be that they aren't all empty? Then it seems like we have overlapping and we can't
>>>>>>> just do a mapping, right?
>>>>>> Possibly that would simply mean to return an error, yes.
>>>>>>
>>>>>>> Won't be this procedure consume a lot of time as it is needed to go through each page
>>>>>>> tables for each entry.
>>>>>> Well, you're free to suggest a clean alternative without doing so.
>>>>> I thought about dynamically allocating an array in p2m_set_entry(), where to save all changed PTEs,
>>>>> and then use it to roll back if __p2m_set_entry() returns rc != 0 ...
>>> That's another possible source for failure, and such an allocation may end
>>> up being a rather big one.
>>>
>>>>>>>>      _Then_ you know how to correctly roll back.
>>>>>>>> And yes, doing so may not even require passing back information on how much of
>>>>>>>> a region was successfully mapped.
>>>>>>> If P2M entries were empty before start of the mapping then it is enough to just go
>>>>>>> through the same range (sgfn,nr,smfn) and just clean them, right?
>>>>>> Yes, what else would "roll back" mean in that case?
>>>>> ... If we know that the P2M entries were empty, then there's nothing else to be done, just
>>>>> clean PTE is needed to be done.
>>>>> However, if the P2M entries weren’t empty (and I’m still not sure whether that’s a legal
>>>>> case), then rolling back would mean restoring their original state, the state they
>>>>> had before the P2M mapping procedure started.
>>>> Possible roll back is harder to implement as expected because there is a case where subtree
>>>> could be freed:
>>>>        /*
>>>>         * Free the entry only if the original pte was valid and the base
>>>>         * is different (to avoid freeing when permission is changed).
>>>>         */
>>>>        if ( p2me_is_valid(p2m, orig_pte) &&
>>>>             !mfn_eq(pte_get_mfn(*entry), pte_get_mfn(orig_pte)) )
>>>>            p2m_free_subtree(p2m, orig_pte, level);
>>>> In this case then it will be needed to store the full subtree.
>>> Right, which is why it may be desirable to limit the ability to update multiple
>>> entries in one go. Or work from certain assumptions, violation of which would
>>> cause the domain to be crashed.
>> It seems to me that the main issue with updating multiple entries in one go is the rollback
>> mechanism in case of a partial mapping failure. (other issues? mapping could consume a lot
>> of time so something should wait while allocation will end?) In my opinion, the rollback
>> mechanism is quite complex to implement and could become a source of further failures.
>> For example, most of the cases where p2m_set_entry() could fail are due to failure in
>> mapping the page table (to allow Xen to walk through it) or failure in creating a new page
>> table due to memory exhaustion. Then, during rollback, which might also require memory
>> allocation, we could face the same memory shortage issue.
>> And what should be done in that case?
>>
>> In my opinion, the best option is to simply return from p2m_set_entry() the number of
>> successfully mapped GFNs (stored in rc which is returned by p2m_set_entry()) and let
>> the caller decide how to handle the partial mapping:
>> 1. If a partial mapping occurs during domain creation, we could just report that this
>>      domain can't be created and continue without it if there are other domains to start;
>>      otherwise, panic.
> I don't see how panic()-ing is relevant here. That's to be decided (far) up
> the call stack.

So it's just a question of whether the caller should panic() or propagate the return
value (error code) up the call stack.

For example, in the case of domain construction, the return value is propagated almost to the top
of the call stack:
   p2m_set_entry(p2m_access_t a, p2m_type_t t, mfn_t smfn, unsigned long nr, gfn_t sgfn, struct p2m_domain * p2m) (/run/media/ok/blue_disk//xen/xen/arch/riscv/p2m.c:1005)
   p2m_insert_mapping(struct domain * d, gfn_t start_gfn, unsigned long nr, mfn_t mfn, p2m_type_t t) (/run/media/ok/blue_disk//xen/xen/arch/riscv/p2m.c:1055)
   guest_physmap_add_entry(struct domain * d, gfn_t gfn, mfn_t mfn, unsigned long page_order, p2m_type_t t) (/run/media/ok/blue_disk//xen/xen/arch/riscv/p2m.c:1076)
   guest_physmap_add_page(unsigned int page_order, struct domain * d) (/run/media/ok/blue_disk//xen/xen/arch/riscv/include/asm/p2m.h:152)
   guest_map_pages(struct domain * d, struct page_info * pg, unsigned int order, void * extra) (/run/media/ok/blue_disk//xen/xen/common/device-tree/domain-build.c:63)
   allocate_domheap_memory(struct domain * d, paddr_t tot_size, alloc_domheap_mem_cb cb, void * extra) (/run/media/ok/blue_disk//xen/xen/common/device-tree/domain-build.c:47)
   allocate_bank_memory(struct kernel_info * kinfo, gfn_t sgfn, paddr_t tot_size) (/run/media/ok/blue_disk//xen/xen/common/device-tree/domain-build.c:99)
   allocate_memory(struct domain * d, struct kernel_info * kinfo) (/run/media/ok/blue_disk//xen/xen/include/xen/mm-frame.h:43)
   construct_domU(struct domain * d, const struct dt_device_node * node) (/run/media/ok/blue_disk//xen/xen/common/device-tree/dom0less-build.c:835)
   create_domUs() (/run/media/ok/blue_disk//xen/xen/common/device-tree/dom0less-build.c:1019)
   start_xen(unsigned long bootcpu_id, paddr_t dtb_addr) (/run/media/ok/blue_disk//xen/xen/arch/riscv/setup.c:296)
   start() (/run/media/ok/blue_disk//xen/xen/arch/riscv/riscv64/head.S:61)

And panic() almost at the end:
         rc = construct_domU(d, node);
         if ( rc )
             panic("Could not set up domain %s (rc = %d)\n",
                   dt_node_name(node), rc);

>
>> 2. If a partial mapping occurs during the lifetime of a domain, for example, if the domain
>>      requests to map some memory, we return the number of successfully mapped GFNs and let the
>>      domain decide what to do: either remove the mappings or retry mapping the remaining part.
>>      However, I think there's not much value in retrying, since p2m_set_entry() is likely to
>>      fail again. So, perhaps the best course of action is to stop the domain altogether.
>> Does that make sense?
> Sure, why not. Provided you actually have a way to communicate back how much
> was mapped.

I was thinking of simply returning it as the return value. This way, a return value of 0 would
indicate that everything was mapped successfully, a value greater than 0 would indicate
how many GFNs were successfully mapped, and a negative value that nothing was mapped at all:
     static int p2m_set_entry(struct p2m_domain *p2m,
                            gfn_t sgfn,
                            unsigned long nr,
                            mfn_t smfn,
                            p2m_type_t t,
                            p2m_access_t a)
    {
       int rc = 0;
       unsigned int i;
   
       ...
   
       for ( i = 1; i <= nr; i++ )
       {
           ...
           rc = __p2m_set_entry(p2m, sgfn, order, smfn, t, a);
           if ( rc )
               break;
           ...
       }
       
       return i == nr ? 0 : i ?: rc;
    }

~ Oleksii
Re: [PATCH v2 11/17] xen/riscv: implement p2m_set_entry() and __p2m_set_entry()
Posted by Jan Beulich 3 months, 3 weeks ago
On 09.07.2025 10:24, Oleksii Kurochko wrote:
> 
> On 7/8/25 6:04 PM, Jan Beulich wrote:
>> On 08.07.2025 17:42, Oleksii Kurochko wrote:
>>> On 7/8/25 2:45 PM, Jan Beulich wrote:
>>>> On 08.07.2025 12:37, Oleksii Kurochko wrote:
>>>>> On 7/8/25 11:01 AM, Oleksii Kurochko wrote:
>>>>>> On 7/8/25 9:10 AM, Jan Beulich wrote:
>>>>>>> On 07.07.2025 18:10, Oleksii Kurochko wrote:
>>>>>>>> On 7/7/25 5:15 PM, Jan Beulich wrote:
>>>>>>>>> On 07.07.2025 17:00, Oleksii Kurochko wrote:
>>>>>>>>>> On 7/7/25 2:53 PM, Jan Beulich wrote:
>>>>>>>>>>> On 07.07.2025 13:46, Oleksii Kurochko wrote:
>>>>>>>>>>>> On 7/7/25 9:20 AM, Jan Beulich wrote:
>>>>>>>>>>>>> On 04.07.2025 17:01, Oleksii Kurochko wrote:
>>>>>>>>>>>>>> On 7/1/25 3:49 PM, Jan Beulich wrote:
>>>>>>>>>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>>>>>>>>>> +{
>>>>>>>>>>>>>>>> +    panic("%s: isn't implemented for now\n", __func__);
>>>>>>>>>>>>>>>> +
>>>>>>>>>>>>>>>> +    return false;
>>>>>>>>>>>>>>>> +}
>>>>>>>>>>>>>>> For this function in particular, though: Besides the "p2me" in the name
>>>>>>>>>>>>>>> being somewhat odd (supposedly page table entries here are simply pte_t),
>>>>>>>>>>>>>>> how is this going to be different from pte_is_valid()?
>>>>>>>>>>>>>> pte_is_valid() is checking a real bit of PTE, but p2me_is_valid() is checking
>>>>>>>>>>>>>> what is a type stored in the radix tree (p2m->p2m_types):
>>>>>>>>>>>>>>          /*
>>>>>>>>>>>>>>           * In the case of the P2M, the valid bit is used for other purpose. Use
>>>>>>>>>>>>>>           * the type to check whether an entry is valid.
>>>>>>>>>>>>>>           */
>>>>>>>>>>>>>>          static inline bool p2me_is_valid(struct p2m_domain *p2m, pte_t pte)
>>>>>>>>>>>>>>          {
>>>>>>>>>>>>>>              return p2m_type_radix_get(p2m, pte) != p2m_invalid;
>>>>>>>>>>>>>>          }
>>>>>>>>>>>>>>
>>>>>>>>>>>>>> It is done to track which page was modified by a guest.
>>>>>>>>>>>>> But then (again) the name doesn't convey what the function does.
>>>>>>>>>>>> Then probably p2me_type_is_valid(struct p2m_domain *p2m, pte_t pte) would better.
>>>>>>>>>>> For P2M type checks please don't invent new naming, but use what both x86
>>>>>>>>>>> and Arm are already using. Note how we already have p2m_is_valid() in that
>>>>>>>>>>> set. Just that it's not doing what you want here.
>>>>>>>>>> Hm, why not doing what I want? p2m_is_valid() verifies if P2M entry is valid.
>>>>>>>>>> And in here it is checked if P2M pte is valid from P2M point of view by checking
>>>>>>>>>> the type in radix tree and/or in reserved PTEs bits (just to remind we have only 2
>>>>>>>>>> free bits for type).
>>>>>>>>> Because this is how it's defined on x86:
>>>>>>>>>
>>>>>>>>> #define p2m_is_valid(_t)    (p2m_to_mask(_t) & \
>>>>>>>>>                                  (P2M_RAM_TYPES | p2m_to_mask(p2m_mmio_direct)))
>>>>>>>>>
>>>>>>>>> I.e. more strict that simply "!= p2m_invalid". And I think such predicates
>>>>>>>>> would better be uniform across architectures, such that in principle they
>>>>>>>>> might also be usable in common code (as we already do with p2m_is_foreign()).
>>>>>>>> Yeah, Arm isn't so strict in definition of p2m_is_valid() and it seems like
>>>>>>>> x86 and Arm have different understanding what is valid.
>>>>>>>>
>>>>>>>> Except what mentioned in the comment that grant types aren't considered valid
>>>>>>>> for x86 (and shouldn't be the same then for Arm?), it isn't clear why x86's
>>>>>>>> p2m_is_valid() is stricter then Arm's one and if other arches should be also
>>>>>>>> so strict.
>>>>>>> Arm's p2m_is_valid() is entirely different (and imo misnamed, but arguably one
>>>>>>> could also consider x86'es to require a better name). It's a local helper, not
>>>>>>> a P2M type checking predicate. With that in mind, you may of course follow
>>>>>>> Arm's model, but in the longer run we may need to do something about the name
>>>>>>> collision then.
>>>>>>>
>>>>>>>>>> The only use case I can think of is that the caller
>>>>>>>>>> might try to map the remaining GFNs again. But that doesn’t seem very useful,
>>>>>>>>>> if p2m_set_entry() wasn’t able to map the full range, it likely indicates a serious
>>>>>>>>>> issue, and retrying would probably result in the same error.
>>>>>>>>>>
>>>>>>>>>> The same applies to rolling back the state. It wouldn’t be difficult to add a local
>>>>>>>>>> array to track all modified PTEs and then use it to revert the state if needed.
>>>>>>>>>> But again, what would the caller do after the rollback? At this point, it still seems
>>>>>>>>>> like the best option is simply to panic().
>>>>>>>>>>
>>>>>>>>>> Basically, I don’t see or understand the cases where knowing how many GFNs were
>>>>>>>>>> successfully mapped, or whether a rollback was performed, would really help — because
>>>>>>>>>> in most cases, I don’t have a better option than just calling panic() at the end.
>>>>>>>>> panic()-ing is of course only a last resort. Anything related to domain handling
>>>>>>>>> would better crash only the domain in question. And even that only if suitable
>>>>>>>>> error handling isn't possible.
>>>>>>>> And if there is no still any runnable domain available, for example, we are creating
>>>>>>>> domain and some p2m mapping is called? Will it be enough to just skip booting this domain?
>>>>>>>> If yes, then it is enough to return only an error code, without returning how many GFNs were
>>>>>>>> mapped or rolling back, as the domain won't be run anyway.
>>>>>>> During domain creation all you need to do is return an error. But when you write a
>>>>>>> generic function that's also (going to be) used at domain runtime, you need to
>>>>>>> consider what to do there in case of partial success.
>>>>>>>
>>>>>>>>>> For example, if I call|map_regions_p2mt()| for an MMIO region described in a device
>>>>>>>>>> tree node, and the mapping fails partway through, I’m left with two options: either
>>>>>>>>>> ignore the device (if it's not essential for Xen or guest functionality) and continue
>>>>>>>>>>       booting; in which case I’d need to perform a rollback, and simply knowing the number
>>>>>>>>>> of successfully mapped GFNs may not be enough or, more likely, just panic.
>>>>>>>>> Well, no. For example, before even trying to map you could check that the range
>>>>>>>>> of P2M entries covered is all empty.
>>>>>>>> Could it be that they aren't all empty? Then it seems like we have overlapping and we can't
>>>>>>>> just do a mapping, right?
>>>>>>> Possibly that would simply mean to return an error, yes.
>>>>>>>
>>>>>>>> Won't be this procedure consume a lot of time as it is needed to go through each page
>>>>>>>> tables for each entry.
>>>>>>> Well, you're free to suggest a clean alternative without doing so.
>>>>>> I thought about dynamically allocating an array in p2m_set_entry(), where to save all changed PTEs,
>>>>>> and then use it to roll back if __p2m_set_entry() returns rc != 0 ...
>>>> That's another possible source for failure, and such an allocation may end
>>>> up being a rather big one.
>>>>
>>>>>>>>>      _Then_ you know how to correctly roll back.
>>>>>>>>> And yes, doing so may not even require passing back information on how much of
>>>>>>>>> a region was successfully mapped.
>>>>>>>> If P2M entries were empty before start of the mapping then it is enough to just go
>>>>>>>> through the same range (sgfn,nr,smfn) and just clean them, right?
>>>>>>> Yes, what else would "roll back" mean in that case?
>>>>>> ... If we know that the P2M entries were empty, then there's nothing else to be done, just
>>>>>> clean PTE is needed to be done.
>>>>>> However, if the P2M entries weren’t empty (and I’m still not sure whether that’s a legal
>>>>>> case), then rolling back would mean restoring their original state, the state they
>>>>>> had before the P2M mapping procedure started.
>>>>> Possible roll back is harder to implement as expected because there is a case where subtree
>>>>> could be freed:
>>>>>        /*
>>>>>         * Free the entry only if the original pte was valid and the base
>>>>>         * is different (to avoid freeing when permission is changed).
>>>>>         */
>>>>>        if ( p2me_is_valid(p2m, orig_pte) &&
>>>>>             !mfn_eq(pte_get_mfn(*entry), pte_get_mfn(orig_pte)) )
>>>>>            p2m_free_subtree(p2m, orig_pte, level);
>>>>> In this case then it will be needed to store the full subtree.
>>>> Right, which is why it may be desirable to limit the ability to update multiple
>>>> entries in one go. Or work from certain assumptions, violation of which would
>>>> cause the domain to be crashed.
>>> It seems to me that the main issue with updating multiple entries in one go is the rollback
>>> mechanism in case of a partial mapping failure. (other issues? mapping could consume a lot
>>> of time so something should wait while allocation will end?) In my opinion, the rollback
>>> mechanism is quite complex to implement and could become a source of further failures.
>>> For example, most of the cases where p2m_set_entry() could fail are due to failure in
>>> mapping the page table (to allow Xen to walk through it) or failure in creating a new page
>>> table due to memory exhaustion. Then, during rollback, which might also require memory
>>> allocation, we could face the same memory shortage issue.
>>> And what should be done in that case?
>>>
>>> In my opinion, the best option is to simply return from p2m_set_entry() the number of
>>> successfully mapped GFNs (stored in rc which is returned by p2m_set_entry()) and let
>>> the caller decide how to handle the partial mapping:
>>> 1. If a partial mapping occurs during domain creation, we could just report that this
>>>      domain can't be created and continue without it if there are other domains to start;
>>>      otherwise, panic.
>> I don't see how panic()-ing is relevant here. That's to be decided (far) up
>> the call stack.
> 
> So it's just a question of whether the caller should panic() or propagate the return
> value (error code) up the call stack.
> 
> For example, in the case of domain construction the return value is propagated almost to the top
> of the stack:
>    p2m_set_entry(p2m_access_t a, p2m_type_t t, mfn_t smfn, unsigned long nr, gfn_t sgfn, struct p2m_domain * p2m) (/run/media/ok/blue_disk//xen/xen/arch/riscv/p2m.c:1005)
>    p2m_insert_mapping(struct domain * d, gfn_t start_gfn, unsigned long nr, mfn_t mfn, p2m_type_t t) (/run/media/ok/blue_disk//xen/xen/arch/riscv/p2m.c:1055)
>    guest_physmap_add_entry(struct domain * d, gfn_t gfn, mfn_t mfn, unsigned long page_order, p2m_type_t t) (/run/media/ok/blue_disk//xen/xen/arch/riscv/p2m.c:1076)
>    guest_physmap_add_page(unsigned int page_order, struct domain * d) (/run/media/ok/blue_disk//xen/xen/arch/riscv/include/asm/p2m.h:152)
>    guest_map_pages(struct domain * d, struct page_info * pg, unsigned int order, void * extra) (/run/media/ok/blue_disk//xen/xen/common/device-tree/domain-build.c:63)
>    allocate_domheap_memory(struct domain * d, paddr_t tot_size, alloc_domheap_mem_cb cb, void * extra) (/run/media/ok/blue_disk//xen/xen/common/device-tree/domain-build.c:47)
>    allocate_bank_memory(struct kernel_info * kinfo, gfn_t sgfn, paddr_t tot_size) (/run/media/ok/blue_disk//xen/xen/common/device-tree/domain-build.c:99)
>    allocate_memory(struct domain * d, struct kernel_info * kinfo) (/run/media/ok/blue_disk//xen/xen/include/xen/mm-frame.h:43)
>    construct_domU(struct domain * d, const struct dt_device_node * node) (/run/media/ok/blue_disk//xen/xen/common/device-tree/dom0less-build.c:835)
>    create_domUs() (/run/media/ok/blue_disk//xen/xen/common/device-tree/dom0less-build.c:1019)
>    start_xen(unsigned long bootcpu_id, paddr_t dtb_addr) (/run/media/ok/blue_disk//xen/xen/arch/riscv/setup.c:296)
>    start() (/run/media/ok/blue_disk//xen/xen/arch/riscv/riscv64/head.S:61)
> 
> And panic() almost at the end:
>          rc = construct_domU(d, node);
>          if ( rc )
>              panic("Could not set up domain %s (rc = %d)\n",
>                    dt_node_name(node), rc);

Which is what is wanted, imo.

Jan