This patch adds the initial logic for constructing PTEs from MFNs in the RISC-V
p2m subsystem. It includes:
- Implementation of p2m_entry_from_mfn(): Generates a valid PTE using the
  given MFN, p2m_type_t, and p2m_access_t, including permission encoding and
  PBMT attribute setup.
- New helper p2m_set_permission(): Encodes access rights (r, w, x) into the
  PTE based on both p2m type and access permissions.
- p2m_type_radix_set(): Stores the p2m type in a radix tree keyed by the GFN
  (derived from the PTE's MFN), for later retrieval.
PBMT type encoding support:
- Introduces an enum pbmt_type_t to represent the PBMT field values.
- Maps types like p2m_mmio_direct_dev to pbmt_io, others default to pbmt_pma.
Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
Changes in V2:
 - New patch. It was part of the bigger patch "xen/riscv: implement p2m mapping
   functionality", which was split into smaller ones.
---
 xen/arch/riscv/include/asm/page.h |   8 +++
 xen/arch/riscv/p2m.c              | 103 ++++++++++++++++++++++++++++--
 2 files changed, 107 insertions(+), 4 deletions(-)
diff --git a/xen/arch/riscv/include/asm/page.h b/xen/arch/riscv/include/asm/page.h
index c67b9578c9..1d1054fa5c 100644
--- a/xen/arch/riscv/include/asm/page.h
+++ b/xen/arch/riscv/include/asm/page.h
@@ -76,6 +76,14 @@
 #define PTE_SMALL       BIT(10, UL)
 #define PTE_POPULATE    BIT(11, UL)
 
+enum pbmt_type_t {
+    pbmt_pma,
+    pbmt_nc,
+    pbmt_io,
+    pbmt_rsvd,
+    pbmt_max,
+};
+
 #define PTE_ACCESS_MASK (PTE_READABLE | PTE_WRITABLE | PTE_EXECUTABLE)
 
 #define PTE_PBMT_MASK   (PTE_PBMT_NOCACHE | PTE_PBMT_IO)
diff --git a/xen/arch/riscv/p2m.c b/xen/arch/riscv/p2m.c
index 6b11e87b22..cba04acf38 100644
--- a/xen/arch/riscv/p2m.c
+++ b/xen/arch/riscv/p2m.c
@@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
     return __map_domain_page(p2m->root + root_table_indx);
 }
 
+static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
+{
+    int rc;
+    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
+
+    rc = radix_tree_insert(&p2m->p2m_type, gfn_x(gfn),
+                           radix_tree_int_to_ptr(t));
+    if ( rc == -EEXIST )
+    {
+        /* If a setting already exists, change it to the new one */
+        radix_tree_replace_slot(
+            radix_tree_lookup_slot(
+                &p2m->p2m_type, gfn_x(gfn)),
+            radix_tree_int_to_ptr(t));
+        rc = 0;
+    }
+
+    return rc;
+}
+
 static p2m_type_t p2m_type_radix_get(struct p2m_domain *p2m, pte_t pte)
 {
     void *ptr;
@@ -389,12 +409,87 @@ static inline void p2m_remove_pte(pte_t *p, bool clean_pte)
     p2m_write_pte(p, pte, clean_pte);
 }
 
-static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn,
-                                p2m_type_t t, p2m_access_t a)
+static void p2m_set_permission(pte_t *e, p2m_type_t t, p2m_access_t a)
 {
-    panic("%s: hasn't been implemented yet\n", __func__);
+    /* First apply type permissions */
+    switch ( t )
+    {
+    case p2m_ram_rw:
+        e->pte |= PTE_ACCESS_MASK;
+        break;
+
+    case p2m_mmio_direct_dev:
+        e->pte |= (PTE_READABLE | PTE_WRITABLE);
+        e->pte &= ~PTE_EXECUTABLE;
+        break;
+
+    case p2m_invalid:
+        e->pte &= ~PTE_ACCESS_MASK;
+        break;
+
+    default:
+        BUG();
+        break;
+    }
+
+    /* Then restrict with access permissions */
+    switch ( a )
+    {
+    case p2m_access_rwx:
+        break;
+    case p2m_access_wx:
+        e->pte &= ~PTE_READABLE;
+        break;
+    case p2m_access_rw:
+        e->pte &= ~PTE_EXECUTABLE;
+        break;
+    case p2m_access_w:
+        e->pte &= ~(PTE_READABLE | PTE_EXECUTABLE);
+        e->pte &= ~PTE_EXECUTABLE;
+        break;
+    case p2m_access_rx:
+    case p2m_access_rx2rw:
+        e->pte &= ~PTE_WRITABLE;
+        break;
+    case p2m_access_x:
+        e->pte &= ~(PTE_READABLE | PTE_WRITABLE);
+        break;
+    case p2m_access_r:
+        e->pte &= ~(PTE_WRITABLE | PTE_EXECUTABLE);
+        break;
+    case p2m_access_n:
+    case p2m_access_n2rwx:
+        e->pte &= ~PTE_ACCESS_MASK;
+        break;
+    default:
+        BUG();
+        break;
+    }
+}
+
+static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn, p2m_type_t t, p2m_access_t a)
+{
+    pte_t e = (pte_t) { 1 };
+
+    switch ( t )
+    {
+    case p2m_mmio_direct_dev:
+        e.pte |= PTE_PBMT_IO;
+        break;
+
+    default:
+        break;
+    }
+
+    p2m_set_permission(&e, t, a);
+
+    ASSERT(!(mfn_to_maddr(mfn) & ~PADDR_MASK));
+
+    pte_set_mfn(&e, mfn);
+
+    BUG_ON(p2m_type_radix_set(p2m, e, t));
 
-    return (pte_t) { .pte = 0 };
+    return e;
 }
 
 #define GUEST_TABLE_MAP_NONE 0
-- 
2.49.0

On 10.06.2025 15:05, Oleksii Kurochko wrote:
> --- a/xen/arch/riscv/include/asm/page.h
> +++ b/xen/arch/riscv/include/asm/page.h
> @@ -76,6 +76,14 @@
>  #define PTE_SMALL       BIT(10, UL)
>  #define PTE_POPULATE    BIT(11, UL)
>  
> +enum pbmt_type_t {
Please can we stick to _t suffixes only being used on typedef-ed identifiers?
> +    pbmt_pma,
> +    pbmt_nc,
> +    pbmt_io,
> +    pbmt_rsvd,
> +    pbmt_max,
It's a 2-bit field in the PTE, isn't it? In which case the maximum valid value
to put there is 3. That's what an identifier named "max" should evaluate to.
The value 4 here would want to be named "count", "num", "nr", or alike.
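That is, something along these lines (merely a sketch of what I mean; exact
naming is of course up to you, with the _t suffix dropped per the comment
further up):

    enum pbmt_type {
        pbmt_pma,
        pbmt_nc,
        pbmt_io,
        pbmt_rsvd,
        pbmt_count,
        pbmt_max = pbmt_count - 1,
    };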
> --- a/xen/arch/riscv/p2m.c
> +++ b/xen/arch/riscv/p2m.c
> @@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>      return __map_domain_page(p2m->root + root_table_indx);
>  }
>  
> +static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
See comments on the earlier patch regarding naming.
> +{
> +    int rc;
> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
How does this work, when you record GFNs only for Xenheap pages? I don't
think you can get around having the caller pass in the GFN. At which point
the PTE probably doesn't need passing.
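That is, something like (merely a sketch, naming questions aside):

    static int p2m_set_type(struct p2m_domain *p2m, gfn_t gfn, p2m_type_t t);

with the GFN supplied by the caller rather than derived via mfn_to_gfn().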
> +    rc = radix_tree_insert(&p2m->p2m_type, gfn_x(gfn),
> +                           radix_tree_int_to_ptr(t));
> +    if ( rc == -EEXIST )
> +    {
> +        /* If a setting already exists, change it to the new one */
> +        radix_tree_replace_slot(
> +            radix_tree_lookup_slot(
> +                &p2m->p2m_type, gfn_x(gfn)),
> +            radix_tree_int_to_ptr(t));
> +        rc = 0;
> +    }
> +
> +    return rc;
> +}
> +
>  static p2m_type_t p2m_type_radix_get(struct p2m_domain *p2m, pte_t pte)
>  {
>      void *ptr;
> @@ -389,12 +409,87 @@ static inline void p2m_remove_pte(pte_t *p, bool clean_pte)
>      p2m_write_pte(p, pte, clean_pte);
>  }
>  
> -static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn,
> -                                p2m_type_t t, p2m_access_t a)
> +static void p2m_set_permission(pte_t *e, p2m_type_t t, p2m_access_t a)
>  {
> -    panic("%s: hasn't been implemented yet\n", __func__);
> +    /* First apply type permissions */
> +    switch ( t )
> +    {
> +    case p2m_ram_rw:
> +        e->pte |= PTE_ACCESS_MASK;
> +        break;
> +
> +    case p2m_mmio_direct_dev:
> +        e->pte |= (PTE_READABLE | PTE_WRITABLE);
> +        e->pte &= ~PTE_EXECUTABLE;
What's wrong with code living in MMIO, e.g. in the ROM of a PCI device?
Such code would want to be executable.
> +        break;
> +
> +    case p2m_invalid:
> +        e->pte &= ~PTE_ACCESS_MASK;
> +        break;
> +
> +    default:
> +        BUG();
> +        break;
> +    }
I think you ought to handle all types that are defined right away. I also
don't think you should BUG() in the default case (also in the other switch()
below). ASSERT_UNREACHABLE() may be fine, along with clearing all permissions
in the entry for release builds.
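I.e. something like (just a sketch):

    default:
        ASSERT_UNREACHABLE();
        e->pte &= ~PTE_ACCESS_MASK;
        break;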
> +    /* Then restrict with access permissions */
> +    switch ( a )
> +    {
> +    case p2m_access_rwx:
> +        break;
> +    case p2m_access_wx:
> +        e->pte &= ~PTE_READABLE;
> +        break;
> +    case p2m_access_rw:
> +        e->pte &= ~PTE_EXECUTABLE;
> +        break;
> +    case p2m_access_w:
> +        e->pte &= ~(PTE_READABLE | PTE_EXECUTABLE);
> +        e->pte &= ~PTE_EXECUTABLE;
> +        break;
> +    case p2m_access_rx:
> +    case p2m_access_rx2rw:
> +        e->pte &= ~PTE_WRITABLE;
> +        break;
> +    case p2m_access_x:
> +        e->pte &= ~(PTE_READABLE | PTE_WRITABLE);
> +        break;
> +    case p2m_access_r:
> +        e->pte &= ~(PTE_WRITABLE | PTE_EXECUTABLE);
> +        break;
> +    case p2m_access_n:
> +    case p2m_access_n2rwx:
> +        e->pte &= ~PTE_ACCESS_MASK;
> +        break;
> +    default:
> +        BUG();
> +        break;
> +    }
Nit: Blank lines between non-fall-through case blocks, please.
> +static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn, p2m_type_t t, p2m_access_t a)
> +{
> +    pte_t e = (pte_t) { 1 };
What's the 1 doing here?
> +    switch ( t )
> +    {
> +    case p2m_mmio_direct_dev:
> +        e.pte |= PTE_PBMT_IO;
> +        break;
> +
> +    default:
> +        break;
> +    }
> +
> +    p2m_set_permission(&e, t, a);
> +
> +    ASSERT(!(mfn_to_maddr(mfn) & ~PADDR_MASK));
> +
> +    pte_set_mfn(&e, mfn);
Based on how things work on x86 (and how I would have expected them to also
work on Arm), may I suggest that you set MFN ahead of permissions, so that
the permissions setting function can use the MFN for e.g. a lookup in
mmio_ro_ranges.
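I.e. (sketch only):

    pte_set_mfn(&e, mfn);
    p2m_set_permission(&e, t, a);

such that p2m_set_permission() could then e.g. consult mmio_ro_ranges for the MFN.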
> +    BUG_ON(p2m_type_radix_set(p2m, e, t));
I'm not convinced of this error handling here either. Radix tree insertion
_can_ fail, e.g. when there's no memory left. This must not bring down Xen,
or we'll have an XSA right away. You could zap the PTE, or if need be you
could crash the offending domain.
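E.g. (just a sketch, using the names from this patch):

    if ( p2m_type_radix_set(p2m, e, t) )
    {
        e.pte = 0;                       /* zap the PTE ... */
        /* ... or: domain_crash(p2m->domain); */
    }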
In this context (not sure if I asked before): With this use of a radix tree,
how do you intend to bound the amount of memory that a domain can use, by
making Xen insert very many entries?
Jan

On 7/1/25 5:08 PM, Jan Beulich wrote:
> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>
>> --- a/xen/arch/riscv/p2m.c
>> +++ b/xen/arch/riscv/p2m.c
>> @@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>>       return __map_domain_page(p2m->root + root_table_indx);
>>   }
>>   
>> +static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
> See comments on the earlier patch regarding naming.
>
>> +{
>> +    int rc;
>> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
> How does this work, when you record GFNs only for Xenheap pages?
I don't think I understand what the issue is. Could you please provide
some extra details?
> I don't
> think you can get around having the caller pass in the GFN. At which point
> the PTE probably doesn't need passing.
It’s an option. I think we still need the PTE argument because, as we discussed
earlier, some P2M types will partly be stored in PTE bits.
I’m also wondering whether the MFN could be used to identify the P2M PTE’s type,
or whether it isn’t unique in general (since different GFNs can map to the same MFN),
meaning it can't reliably be used to determine the PTE’s type. Right?
>
>> +    rc = radix_tree_insert(&p2m->p2m_type, gfn_x(gfn),
>> +                           radix_tree_int_to_ptr(t));
>> +    if ( rc == -EEXIST )
>> +    {
>> +        /* If a setting already exists, change it to the new one */
>> +        radix_tree_replace_slot(
>> +            radix_tree_lookup_slot(
>> +                &p2m->p2m_type, gfn_x(gfn)),
>> +            radix_tree_int_to_ptr(t));
>> +        rc = 0;
>> +    }
>> +
>> +    return rc;
>> +}
>> +
>>   static p2m_type_t p2m_type_radix_get(struct p2m_domain *p2m, pte_t pte)
>>   {
>>       void *ptr;
>> @@ -389,12 +409,87 @@ static inline void p2m_remove_pte(pte_t *p, bool clean_pte)
>>       p2m_write_pte(p, pte, clean_pte);
>>   }
>>   
>> -static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn,
>> -                                p2m_type_t t, p2m_access_t a)
>> +static void p2m_set_permission(pte_t *e, p2m_type_t t, p2m_access_t a)
>>   {
>> -    panic("%s: hasn't been implemented yet\n", __func__);
>> +    /* First apply type permissions */
>> +    switch ( t )
>> +    {
>> +    case p2m_ram_rw:
>> +        e->pte |= PTE_ACCESS_MASK;
>> +        break;
>> +
>> +    case p2m_mmio_direct_dev:
>> +        e->pte |= (PTE_READABLE | PTE_WRITABLE);
>> +        e->pte &= ~PTE_EXECUTABLE;
> What's wrong with code living in MMIO, e.g. in the ROM of a PCI device?
> Such code would want to be executable.
I think you are right and there is nothing wrong with code living in MMIO.
According to the spec:
   I/O regions can specify which combinations of read, write, or execute accesses
   to which data widths are supported.
>> +        break;
>> +
>> +    case p2m_invalid:
>> +        e->pte &= ~PTE_ACCESS_MASK;
>> +        break;
>> +
>> +    default:
>> +        BUG();
>> +        break;
>> +    }
> I think you ought to handle all types that are defined right away. I also
> don't think you should BUG() in the default case (also in the other switch()
> below). ASSERT_UNEACHABLE() may be fine, along with clearing all permissions
> in the entry for release builds.
>
>> +    /* Then restrict with access permissions */
>> +    switch ( a )
>> +    {
>> +    case p2m_access_rwx:
>> +        break;
>> +    case p2m_access_wx:
>> +        e->pte &= ~PTE_READABLE;
>> +        break;
>> +    case p2m_access_rw:
>> +        e->pte &= ~PTE_EXECUTABLE;
>> +        break;
>> +    case p2m_access_w:
>> +        e->pte &= ~(PTE_READABLE | PTE_EXECUTABLE);
>> +        e->pte &= ~PTE_EXECUTABLE;
>> +        break;
>> +    case p2m_access_rx:
>> +    case p2m_access_rx2rw:
>> +        e->pte &= ~PTE_WRITABLE;
>> +        break;
>> +    case p2m_access_x:
>> +        e->pte &= ~(PTE_READABLE | PTE_WRITABLE);
>> +        break;
>> +    case p2m_access_r:
>> +        e->pte &= ~(PTE_WRITABLE | PTE_EXECUTABLE);
>> +        break;
>> +    case p2m_access_n:
>> +    case p2m_access_n2rwx:
>> +        e->pte &= ~PTE_ACCESS_MASK;
>> +        break;
>> +    default:
>> +        BUG();
>> +        break;
>> +    }
> Nit: Blank lines between non-fall-through case blocks, please.
>
>> +static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn, p2m_type_t t, p2m_access_t a)
>> +{
>> +    pte_t e = (pte_t) { 1 };
> What's the 1 doing here?
Set valid bit of PTE to 1.
>
>> +    switch ( t )
>> +    {
>> +    case p2m_mmio_direct_dev:
>> +        e.pte |= PTE_PBMT_IO;
>> +        break;
>> +
>> +    default:
>> +        break;
>> +    }
>> +
>> +    p2m_set_permission(&e, t, a);
>> +
>> +    ASSERT(!(mfn_to_maddr(mfn) & ~PADDR_MASK));
>> +
>> +    pte_set_mfn(&e, mfn);
> Based on how things work on x86 (and how I would have expected them to also
> work on Arm), may I suggest that you set MFN ahead of permissions, so that
> the permissions setting function can use the MFN for e.g. a lookup in
> mmio_ro_ranges.
Sure, just a note that on Arm, the MFN is set last.
>
>> +    BUG_ON(p2m_type_radix_set(p2m, e, t));
> I'm not convinced of this error handling here either. Radix tree insertion
> _can_ fail, e.g. when there's no memory left. This must not bring down Xen,
> or we'll have an XSA right away. You could zap the PTE, or if need be you
> could crash the offending domain.
IIUC what "zap the PTE" means, then I would do it this way:
     if ( p2m_set_type(p2m, e, t) )
         e.pte = 0;
But then it will lead to an MMU failure; how is that expected to be handled?
There’s no guarantee that, at the moment of handling this exception, enough
memory will be available to set a type for the PTE, and it also isn't really
clear how the exception handler would detect that it merely needs to re-try
setting a type. Or should we just call domain_crash()?
In that case, it seems more reasonable to call domain_crash() immediately in
p2m_pte_from_mfn().
>
> In this context (not sure if I asked before): With this use of a radix tree,
> how do you intend to bound the amount of memory that a domain can use, by
> making Xen insert very many entries?
I didn’t think about that. I assumed it would be enough to set the amount of
memory a guest domain can use by specifying xen,domain-p2m-mem-mb in the DTS,
or using some predefined value if xen,domain-p2m-mem-mb isn’t explicitly set.
Also, it seems this would just lead to the issue you mentioned earlier: when
the memory runs out, domain_crash() will be called or the PTE will be zapped.
~ Oleksii

On 15.07.2025 16:47, Oleksii Kurochko wrote:
> On 7/1/25 5:08 PM, Jan Beulich wrote:
>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>> --- a/xen/arch/riscv/p2m.c
>>> +++ b/xen/arch/riscv/p2m.c
>>> @@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>>>       return __map_domain_page(p2m->root + root_table_indx);
>>>   }
>>>   
>>> +static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
>> See comments on the earlier patch regarding naming.
>>
>>> +{
>>> +    int rc;
>>> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
>> How does this work, when you record GFNs only for Xenheap pages?
> 
> I think I don't understand what is an issue. Could you please provide
> some extra details?
Counter question: The mfn_to_gfn() you currently have is only a stub. It only
works for 1:1 mapped domains. Can you show me the eventual final implementation
of the function, making it possible to use it here? Having such stubs, and not
even annotated in any way, is imo a problem: People may think they're fine to
use when really they aren't.
>>> +static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn, p2m_type_t t, p2m_access_t a)
>>> +{
>>> +    pte_t e = (pte_t) { 1 };
>> What's the 1 doing here?
> 
> Set valid bit of PTE to 1.
But something like this isn't to be done using a plain, unannotated literal
number. Aiui you mean PTE_VALID here.
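I.e. presumably this is meant to be:

    pte_t e = (pte_t) { PTE_VALID };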
>>> +    switch ( t )
>>> +    {
>>> +    case p2m_mmio_direct_dev:
>>> +        e.pte |= PTE_PBMT_IO;
>>> +        break;
>>> +
>>> +    default:
>>> +        break;
>>> +    }
>>> +
>>> +    p2m_set_permission(&e, t, a);
>>> +
>>> +    ASSERT(!(mfn_to_maddr(mfn) & ~PADDR_MASK));
>>> +
>>> +    pte_set_mfn(&e, mfn);
>> Based on how things work on x86 (and how I would have expected them to also
>> work on Arm), may I suggest that you set MFN ahead of permissions, so that
>> the permissions setting function can use the MFN for e.g. a lookup in
>> mmio_ro_ranges.
> 
> Sure, just a note that on Arm, the MFN is set last.
That's apparently because they (still) don't have mmio_ro_ranges. That's only
a latent issue (I hope) while they still don't have PCI support.
>>> +    BUG_ON(p2m_type_radix_set(p2m, e, t));
>> I'm not convinced of this error handling here either. Radix tree insertion
>> _can_ fail, e.g. when there's no memory left. This must not bring down Xen,
>> or we'll have an XSA right away. You could zap the PTE, or if need be you
>> could crash the offending domain.
> 
> IIUC what is "zap the PTE", then I will do in this way:
>      if ( p2m_set_type(p2m, e, t) )
>          e.pte = 0;
> 
> But then it will lead to an MMU failure—how is that expected to be handled?
> There’s no guarantee that, at the moment of handling this exception, enough
> memory will be available to set a type for the PTE and also there is not really
> clear how to detect in exception handler that it is needed just to re-try to
> set a type. Or should we just call|domain_crash()|?
> In that case, it seems more reasonable to call|domain_crash() |immediately in
> |p2m_pte_from_mfn().|
As said - crashing the domain in such an event is an option. The question
here is whether to do so right away, or whether to defer that in the hope
that the PTE may not actually be accessed (before being rewritten).
>> In this context (not sure if I asked before): With this use of a radix tree,
>> how do you intend to bound the amount of memory that a domain can use, by
>> making Xen insert very many entries?
> 
> I didn’t think about that. I assumed it would be enough to set the amount of
> memory a guest domain can use by specifying|xen,domain-p2m-mem-mb| in the DTS,
> or using some predefined value if|xen,domain-p2m-mem-mb| isn’t explicitly set.
Which would require these allocations to come from that pool.
> Also, it seems this would just lead to the issue you mentioned earlier: when
> the memory runs out,|domain_crash()| will be called or PTE will be zapped.
Or one domain exhausting memory would cause another domain to fail. A domain
impacting just itself may be tolerable. But a domain affecting other domains
isn't.
Jan

On 7/16/25 1:31 PM, Jan Beulich wrote:
> On 15.07.2025 16:47, Oleksii Kurochko wrote:
>> On 7/1/25 5:08 PM, Jan Beulich wrote:
>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>> --- a/xen/arch/riscv/p2m.c
>>>> +++ b/xen/arch/riscv/p2m.c
>>>> @@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>>>>        return __map_domain_page(p2m->root + root_table_indx);
>>>>    }
>>>>    
>>>> +static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
>>> See comments on the earlier patch regarding naming.
>>>
>>>> +{
>>>> +    int rc;
>>>> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
>>> How does this work, when you record GFNs only for Xenheap pages?
>> I think I don't understand what is an issue. Could you please provide
>> some extra details?
> Counter question: The mfn_to_gfn() you currently have is only a stub. It only
> works for 1:1 mapped domains. Can you show me the eventual final implementation
> of the function, making it possible to use it here?
At the moment, I planned to support only 1:1 mapped domains, so this is the final
implementation.
I think I understand your initial question now. So yes, at the moment we have
only Xenheap pages, and since GFNs are stored for such pages, it is easy to
recover the GFN for an MFN, and thus easy to implement mfn_to_gfn() for Xenheap
pages.
>   Having such stubs, and not
> even annotated in any way, is imo a problem: People may thing they're fine to
> use when really they aren't.
Then it would be more correct to pass the GFN in as an argument, as you suggested
earlier (and I've already added such an argument).
I just initially made the incorrect assumption that it is up to the implementation
of mfn_to_gfn() to support any type of page.
>
>>>> +static pte_t p2m_entry_from_mfn(struct p2m_domain *p2m, mfn_t mfn, p2m_type_t t, p2m_access_t a)
>>>> +{
>>>> +    pte_t e = (pte_t) { 1 };
>>> What's the 1 doing here?
>> Set valid bit of PTE to 1.
> But something like this isn't to be done using a plain, unannotated literal
> number. Aiui you mean PTE_VALID here.
Yes. I will use PTE_VALID instead.
>
>>>> +    switch ( t )
>>>> +    {
>>>> +    case p2m_mmio_direct_dev:
>>>> +        e.pte |= PTE_PBMT_IO;
>>>> +        break;
>>>> +
>>>> +    default:
>>>> +        break;
>>>> +    }
>>>> +
>>>> +    p2m_set_permission(&e, t, a);
>>>> +
>>>> +    ASSERT(!(mfn_to_maddr(mfn) & ~PADDR_MASK));
>>>> +
>>>> +    pte_set_mfn(&e, mfn);
>>> Based on how things work on x86 (and how I would have expected them to also
>>> work on Arm), may I suggest that you set MFN ahead of permissions, so that
>>> the permissions setting function can use the MFN for e.g. a lookup in
>>> mmio_ro_ranges.
>> Sure, just a note that on Arm, the MFN is set last.
> That's apparently because they (still) don't have mmio_ro_ranges. That's only
> a latent issue (I hope) while they still don't have PCI support.
>
>>>> +    BUG_ON(p2m_type_radix_set(p2m, e, t));
>>> I'm not convinced of this error handling here either. Radix tree insertion
>>> _can_ fail, e.g. when there's no memory left. This must not bring down Xen,
>>> or we'll have an XSA right away. You could zap the PTE, or if need be you
>>> could crash the offending domain.
>> IIUC what is "zap the PTE", then I will do in this way:
>>       if ( p2m_set_type(p2m, e, t) )
>>           e.pte = 0;
>>
>> But then it will lead to an MMU failure—how is that expected to be handled?
>> There’s no guarantee that, at the moment of handling this exception, enough
>> memory will be available to set a type for the PTE and also there is not really
>> clear how to detect in exception handler that it is needed just to re-try to
>> set a type. Or should we just call|domain_crash()|?
>> In that case, it seems more reasonable to call|domain_crash() |immediately in
>> |p2m_pte_from_mfn().|
> As said - crashing the domain in such an event is an option. The question
> here is whether to do so right away, or whether to defer that in the hope
> that the PTE may not actually be accessed (before being rewritten).
>
>>> In this context (not sure if I asked before): With this use of a radix tree,
>>> how do you intend to bound the amount of memory that a domain can use, by
>>> making Xen insert very many entries?
>> I didn’t think about that. I assumed it would be enough to set the amount of
>> memory a guest domain can use by specifying|xen,domain-p2m-mem-mb| in the DTS,
>> or using some predefined value if|xen,domain-p2m-mem-mb| isn’t explicitly set.
> Which would require these allocations to come from that pool.
Yes, and it is true only for non-hardware domains with the current implementation.
>
>> Also, it seems this would just lead to the issue you mentioned earlier: when
>> the memory runs out,|domain_crash()| will be called or PTE will be zapped.
> Or one domain exhausting memory would cause another domain to fail. A domain
> impacting just itself may be tolerable. But a domain affecting other domains
> isn't.
But it seems like this issue could happen in any implementation. It would only not
happen if every domain type (hardware, control, guest domain) had nothing but a
pre-populated pool, without the ability to extend it or to allocate extra pages from
the domheap at runtime.
Otherwise, if extra page allocations are allowed, then we can't really do anything
about this issue.
~ Oleksii

On 16.07.2025 18:07, Oleksii Kurochko wrote:
> On 7/16/25 1:31 PM, Jan Beulich wrote:
>> On 15.07.2025 16:47, Oleksii Kurochko wrote:
>>> On 7/1/25 5:08 PM, Jan Beulich wrote:
>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>> --- a/xen/arch/riscv/p2m.c
>>>>> +++ b/xen/arch/riscv/p2m.c
>>>>> @@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>>>>>        return __map_domain_page(p2m->root + root_table_indx);
>>>>>    }
>>>>>    
>>>>> +static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
>>>> See comments on the earlier patch regarding naming.
>>>>
>>>>> +{
>>>>> +    int rc;
>>>>> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
>>>> How does this work, when you record GFNs only for Xenheap pages?
> 
> 
>>> I think I don't understand what is an issue. Could you please provide
>>> some extra details?
>> Counter question: The mfn_to_gfn() you currently have is only a stub. It only
>> works for 1:1 mapped domains. Can you show me the eventual final implementation
>> of the function, making it possible to use it here?
> 
> At the moment, I planned to support only 1:1 mapped domains, so it is final
> implementation.
Isn't that an overly severe limitation?
>>>> In this context (not sure if I asked before): With this use of a radix tree,
>>>> how do you intend to bound the amount of memory that a domain can use, by
>>>> making Xen insert very many entries?
>>> I didn’t think about that. I assumed it would be enough to set the amount of
>>> memory a guest domain can use by specifying|xen,domain-p2m-mem-mb| in the DTS,
>>> or using some predefined value if|xen,domain-p2m-mem-mb| isn’t explicitly set.
>> Which would require these allocations to come from that pool.
> 
> Yes, and it is true only for non-hardware domains with the current implementation.
???
>>> Also, it seems this would just lead to the issue you mentioned earlier: when
>>> the memory runs out,|domain_crash()| will be called or PTE will be zapped.
>> Or one domain exhausting memory would cause another domain to fail. A domain
>> impacting just itself may be tolerable. But a domain affecting other domains
>> isn't.
> 
> But it seems like this issue could happen in any implementation. It won't happen only
> if we will have only pre-populated pool for any domain type (hardware, control, guest
> domain) without ability to extend them or allocate extra pages from domheap in runtime.
> Otherwise, if extra pages allocation is allowed then we can't really do something
> with this issue.
But that's why I brought this up: You simply have to. Or, as indicated, the
moment you mark Xen security-supported on RISC-V, there will be an XSA needed.
This is the kind of thing you need to consider up front. Or at least mark with
a prominent FIXME annotation. All of which would need resolving before even
considering to mark code as supported.
Jan

On 7/16/25 6:18 PM, Jan Beulich wrote:
> On 16.07.2025 18:07, Oleksii Kurochko wrote:
>> On 7/16/25 1:31 PM, Jan Beulich wrote:
>>> On 15.07.2025 16:47, Oleksii Kurochko wrote:
>>>> On 7/1/25 5:08 PM, Jan Beulich wrote:
>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>> --- a/xen/arch/riscv/p2m.c
>>>>>> +++ b/xen/arch/riscv/p2m.c
>>>>>> @@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>>>>>>         return __map_domain_page(p2m->root + root_table_indx);
>>>>>>     }
>>>>>>     
>>>>>> +static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
>>>>> See comments on the earlier patch regarding naming.
>>>>>
>>>>>> +{
>>>>>> +    int rc;
>>>>>> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
>>>>> How does this work, when you record GFNs only for Xenheap pages?
>>
>>>> I think I don't understand what is an issue. Could you please provide
>>>> some extra details?
>>> Counter question: The mfn_to_gfn() you currently have is only a stub. It only
>>> works for 1:1 mapped domains. Can you show me the eventual final implementation
>>> of the function, making it possible to use it here?
>> At the moment, I planned to support only 1:1 mapped domains, so it is final
>> implementation.
> Isn't that on overly severe limitation?
I wouldn't say that it's a severe limitation, as it's just a matter of how
mfn_to_gfn() is implemented. When non-1:1 mapped domains are supported,
mfn_to_gfn() can be implemented differently, while the code where it’s called
will likely remain unchanged.
What I meant in my reply is that, for the current state and current limitations,
this is the final implementation of mfn_to_gfn(). But that doesn't mean I don't
see the value in, or the need for, non-1:1 mapped domains; it's just that this
limitation simplifies development at the current stage of the RISC-V port.
>
>>>>> In this context (not sure if I asked before): With this use of a radix tree,
>>>>> how do you intend to bound the amount of memory that a domain can use, by
>>>>> making Xen insert very many entries?
>>>> I didn’t think about that. I assumed it would be enough to set the amount of
>>>> memory a guest domain can use by specifying|xen,domain-p2m-mem-mb| in the DTS,
>>>> or using some predefined value if|xen,domain-p2m-mem-mb| isn’t explicitly set.
>>> Which would require these allocations to come from that pool.
>> Yes, and it is true only for non-hardware domains with the current implementation.
> ???
I meant that the pool is currently used only for non-hardware domains.
>
>>>> Also, it seems this would just lead to the issue you mentioned earlier: when
>>>> the memory runs out,|domain_crash()| will be called or PTE will be zapped.
>>> Or one domain exhausting memory would cause another domain to fail. A domain
>>> impacting just itself may be tolerable. But a domain affecting other domains
>>> isn't.
>> But it seems like this issue could happen in any implementation. It won't happen only
>> if we will have only pre-populated pool for any domain type (hardware, control, guest
>> domain) without ability to extend them or allocate extra pages from domheap in runtime.
>> Otherwise, if extra pages allocation is allowed then we can't really do something
>> with this issue.
> But that's why I brought this up: You simply have to. Or, as indicated, the
> moment you mark Xen security-supported on RISC-V, there will be an XSA needed.
Why isn't it an XSA for other architectures? At least Arm should then have such
an XSA.
I don't understand why x86 won't have the same issue. Memory is a limited
and shared resource, so if one of the domains uses too much memory, then it could
happen that other domains won't have enough memory for their purposes...
> This is the kind of thing you need to consider up front. Or at least mark with
> a prominent FIXME annotation. All of which would need resolving before even
> considering to mark code as supported.
... At the moment, I’m trying to understand whether this issue can be solved properly
at all when a domain is allowed to request or map extra memory for its own purposes.
The only solution I see is that each domain, regardless of its type, should have its
own pre-populated pool. This way, during construction, we’ll know whether the
domain can be created or whether we’ve run out of memory, which would mean that no
more domains can be launched.
And if, at runtime, a domain has no free pages left in its pre-populated pool, then
just stop the domain (or return an error to the domain saying there is no memory
anymore and let the domain decide what to do). Otherwise, if I start to allocate
extra memory for a domain which has no free pages left in its pool, it could lead
to the XSA issue you mentioned: one domain could exhaust memory so that another
domain, at least, won't be able to allocate extra pages (in case that other
domain also has no free pages left in its pool).
~ Oleksii

On 17.07.2025 10:56, Oleksii Kurochko wrote:
> On 7/16/25 6:18 PM, Jan Beulich wrote:
>> On 16.07.2025 18:07, Oleksii Kurochko wrote:
>>> On 7/16/25 1:31 PM, Jan Beulich wrote:
>>>> On 15.07.2025 16:47, Oleksii Kurochko wrote:
>>>>> On 7/1/25 5:08 PM, Jan Beulich wrote:
>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>> --- a/xen/arch/riscv/p2m.c
>>>>>>> +++ b/xen/arch/riscv/p2m.c
>>>>>>> @@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>>>>>>>         return __map_domain_page(p2m->root + root_table_indx);
>>>>>>>     }
>>>>>>>     
>>>>>>> +static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
>>>>>> See comments on the earlier patch regarding naming.
>>>>>>
>>>>>>> +{
>>>>>>> +    int rc;
>>>>>>> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
>>>>>> How does this work, when you record GFNs only for Xenheap pages?
>>>
>>>>> I think I don't understand what is an issue. Could you please provide
>>>>> some extra details?
>>>> Counter question: The mfn_to_gfn() you currently have is only a stub. It only
>>>> works for 1:1 mapped domains. Can you show me the eventual final implementation
>>>> of the function, making it possible to use it here?
>>> At the moment, I planned to support only 1:1 mapped domains, so it is final
>>> implementation.
>> Isn't that on overly severe limitation?
> 
> I wouldn't say that it's a severe limitation, as it's just a matter of how
> |mfn_to_gfn()| is implemented. When non-1:1 mapped domains are supported,
> |mfn_to_gfn()| can be implemented differently, while the code where it’s called
> will likely remain unchanged.
> 
> What I meant in my reply is that, for the current state and current limitations,
> this is the final implementation of|mfn_to_gfn()|. But that doesn't mean I don't
> see the value in, or the need for, non-1:1 mapped domains—it's just that this
> limitation simplifies development at the current stage of the RISC-V port.
Simplification is fine in some cases, but not supporting the "normal" way of
domain construction looks like a pretty odd restriction. I'm also curious
how you envision to implement mfn_to_gfn() then, suitable for generic use like
the one here. Imo, current limitation or not, you simply want to avoid use of
that function outside of the special gnttab case.
>>>>>> In this context (not sure if I asked before): With this use of a radix tree,
>>>>>> how do you intend to bound the amount of memory that a domain can use, by
>>>>>> making Xen insert very many entries?
>>>>> I didn’t think about that. I assumed it would be enough to set the amount of
>>>>> memory a guest domain can use by specifying|xen,domain-p2m-mem-mb| in the DTS,
>>>>> or using some predefined value if|xen,domain-p2m-mem-mb| isn’t explicitly set.
>>>> Which would require these allocations to come from that pool.
>>> Yes, and it is true only for non-hardware domains with the current implementation.
>> ???
> 
> I meant that pool is used now only for non-hardware domains at the moment.
And how does this matter here? The memory required for the radix tree doesn't
come from that pool anyway.
>>>>> Also, it seems this would just lead to the issue you mentioned earlier: when
>>>>> the memory runs out,|domain_crash()| will be called or PTE will be zapped.
>>>> Or one domain exhausting memory would cause another domain to fail. A domain
>>>> impacting just itself may be tolerable. But a domain affecting other domains
>>>> isn't.
>>> But it seems like this issue could happen in any implementation. It won't happen only
>>> if we will have only pre-populated pool for any domain type (hardware, control, guest
>>> domain) without ability to extend them or allocate extra pages from domheap in runtime.
>>> Otherwise, if extra pages allocation is allowed then we can't really do something
>>> with this issue.
>> But that's why I brought this up: You simply have to. Or, as indicated, the
>> moment you mark Xen security-supported on RISC-V, there will be an XSA needed.
> 
> Why it isn't XSA for other architectures? At least, Arm then should have such
> XSA.
Does Arm use a radix tree for storing types? It uses one for mem-access, but
it's not clear to me whether that's actually a supported feature.
> I don't understand why x86 won't have the same issue. Memory is the limited
> and shared resource, so if one of the domain will use to much memory then it could
> happen that other domains won't have enough memory for its purpose...
The question is whether allocations are bounded. With this use of a radix tree,
you give domains a way to have Xen allocate pretty much arbitrary amounts of
memory to populate that tree. That unbounded-ness is the problem, not memory
allocations in general.
Jan

On 7/17/25 12:25 PM, Jan Beulich wrote:
> On 17.07.2025 10:56, Oleksii Kurochko wrote:
>> On 7/16/25 6:18 PM, Jan Beulich wrote:
>>> On 16.07.2025 18:07, Oleksii Kurochko wrote:
>>>> On 7/16/25 1:31 PM, Jan Beulich wrote:
>>>>> On 15.07.2025 16:47, Oleksii Kurochko wrote:
>>>>>> On 7/1/25 5:08 PM, Jan Beulich wrote:
>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>> --- a/xen/arch/riscv/p2m.c
>>>>>>>> +++ b/xen/arch/riscv/p2m.c
>>>>>>>> @@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>>>>>>>>          return __map_domain_page(p2m->root + root_table_indx);
>>>>>>>>      }
>>>>>>>>      
>>>>>>>> +static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
>>>>>>> See comments on the earlier patch regarding naming.
>>>>>>>
>>>>>>>> +{
>>>>>>>> +    int rc;
>>>>>>>> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
>>>>>>> How does this work, when you record GFNs only for Xenheap pages?
>>>>>> I think I don't understand what is an issue. Could you please provide
>>>>>> some extra details?
>>>>> Counter question: The mfn_to_gfn() you currently have is only a stub. It only
>>>>> works for 1:1 mapped domains. Can you show me the eventual final implementation
>>>>> of the function, making it possible to use it here?
>>>> At the moment, I planned to support only 1:1 mapped domains, so it is final
>>>> implementation.
>>> Isn't that on overly severe limitation?
>> I wouldn't say that it's a severe limitation, as it's just a matter of how
>> |mfn_to_gfn()| is implemented. When non-1:1 mapped domains are supported,
>> |mfn_to_gfn()| can be implemented differently, while the code where it’s called
>> will likely remain unchanged.
>>
>> What I meant in my reply is that, for the current state and current limitations,
>> this is the final implementation of|mfn_to_gfn()|. But that doesn't mean I don't
>> see the value in, or the need for, non-1:1 mapped domains—it's just that this
>> limitation simplifies development at the current stage of the RISC-V port.
> Simplification is fine in some cases, but not supporting the "normal" way of
> domain construction looks like a pretty odd restriction. I'm also curious
> how you envision to implement mfn_to_gfn() then, suitable for generic use like
> the one here. Imo, current limitation or not, you simply want to avoid use of
> that function outside of the special gnttab case.
>
>>>>>>> In this context (not sure if I asked before): With this use of a radix tree,
>>>>>>> how do you intend to bound the amount of memory that a domain can use, by
>>>>>>> making Xen insert very many entries?
>>>>>> I didn’t think about that. I assumed it would be enough to set the amount of
>>>>>> memory a guest domain can use by specifying|xen,domain-p2m-mem-mb| in the DTS,
>>>>>> or using some predefined value if|xen,domain-p2m-mem-mb| isn’t explicitly set.
>>>>> Which would require these allocations to come from that pool.
>>>> Yes, and it is true only for non-hardware domains with the current implementation.
>>> ???
>> I meant that pool is used now only for non-hardware domains at the moment.
> And how does this matter here? The memory required for the radix tree doesn't
> come from that pool anyway.
I thought it was possible to do that somehow, but looking at the code of
radix-tree.c it seems like the only way to allocate memory for the radix
tree is radix_tree_node_alloc() -> xzalloc(struct rcu_node).
Then it would be necessary to introduce radix_tree_node_allocate(domain), or the
radix tree can't be used at all, for the security reason mentioned in the previous
replies, no?
>>>>>> Also, it seems this would just lead to the issue you mentioned earlier: when
>>>>>> the memory runs out,|domain_crash()| will be called or PTE will be zapped.
>>>>> Or one domain exhausting memory would cause another domain to fail. A domain
>>>>> impacting just itself may be tolerable. But a domain affecting other domains
>>>>> isn't.
>>>> But it seems like this issue could happen in any implementation. It won't happen only
>>>> if we will have only pre-populated pool for any domain type (hardware, control, guest
>>>> domain) without ability to extend them or allocate extra pages from domheap in runtime.
>>>> Otherwise, if extra pages allocation is allowed then we can't really do something
>>>> with this issue.
>>> But that's why I brought this up: You simply have to. Or, as indicated, the
>>> moment you mark Xen security-supported on RISC-V, there will be an XSA needed.
>> Why it isn't XSA for other architectures? At least, Arm then should have such
>> XSA.
> Does Arm use a radix tree for storing types? It uses one for mem-access, but
> it's not clear to me whether that's actually a supported feature.
>
>> I don't understand why x86 won't have the same issue. Memory is the limited
>> and shared resource, so if one of the domain will use to much memory then it could
>> happen that other domains won't have enough memory for its purpose...
> The question is whether allocations are bounded. With this use of a radix tree,
> you give domains a way to have Xen allocate pretty much arbitrary amounts of
> memory to populate that tree. That unbounded-ness is the problem, not memory
> allocations in general.
Isn't the radix tree key bounded by the amount of GFNs given to a domain? We can't have
more keys than the maximum GFN number for a domain. So the potential amount of memory
needed for the radix tree is also bounded by the amount of GFNs.
Anyway, IIUC I just can't use a radix tree for p2m types at all, right?
If so, does it make sense to borrow 2 bits from struct page_info->type_info, as it
currently uses 9 bits for the count of a frame?
Then we would have a 7-bit reference counter, 2 bits for p2m types in type_info plus
2 bits in the PTE, which in total would give us 16 p2m types.
~ Oleksii

On 18.07.2025 11:52, Oleksii Kurochko wrote:
> 
> On 7/17/25 12:25 PM, Jan Beulich wrote:
>> On 17.07.2025 10:56, Oleksii Kurochko wrote:
>>> On 7/16/25 6:18 PM, Jan Beulich wrote:
>>>> On 16.07.2025 18:07, Oleksii Kurochko wrote:
>>>>> On 7/16/25 1:31 PM, Jan Beulich wrote:
>>>>>> On 15.07.2025 16:47, Oleksii Kurochko wrote:
>>>>>>> On 7/1/25 5:08 PM, Jan Beulich wrote:
>>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>>> --- a/xen/arch/riscv/p2m.c
>>>>>>>>> +++ b/xen/arch/riscv/p2m.c
>>>>>>>>> @@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>>>>>>>>>          return __map_domain_page(p2m->root + root_table_indx);
>>>>>>>>>      }
>>>>>>>>>      
>>>>>>>>> +static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
>>>>>>>> See comments on the earlier patch regarding naming.
>>>>>>>>
>>>>>>>>> +{
>>>>>>>>> +    int rc;
>>>>>>>>> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
>>>>>>>> How does this work, when you record GFNs only for Xenheap pages?
>>>>>>> I think I don't understand what is an issue. Could you please provide
>>>>>>> some extra details?
>>>>>> Counter question: The mfn_to_gfn() you currently have is only a stub. It only
>>>>>> works for 1:1 mapped domains. Can you show me the eventual final implementation
>>>>>> of the function, making it possible to use it here?
>>>>> At the moment, I planned to support only 1:1 mapped domains, so it is final
>>>>> implementation.
>>>> Isn't that on overly severe limitation?
>>> I wouldn't say that it's a severe limitation, as it's just a matter of how
>>> |mfn_to_gfn()| is implemented. When non-1:1 mapped domains are supported,
>>> |mfn_to_gfn()| can be implemented differently, while the code where it’s called
>>> will likely remain unchanged.
>>>
>>> What I meant in my reply is that, for the current state and current limitations,
>>> this is the final implementation of|mfn_to_gfn()|. But that doesn't mean I don't
>>> see the value in, or the need for, non-1:1 mapped domains—it's just that this
>>> limitation simplifies development at the current stage of the RISC-V port.
>> Simplification is fine in some cases, but not supporting the "normal" way of
>> domain construction looks like a pretty odd restriction. I'm also curious
>> how you envision to implement mfn_to_gfn() then, suitable for generic use like
>> the one here. Imo, current limitation or not, you simply want to avoid use of
>> that function outside of the special gnttab case.
>>
>>>>>>>> In this context (not sure if I asked before): With this use of a radix tree,
>>>>>>>> how do you intend to bound the amount of memory that a domain can use, by
>>>>>>>> making Xen insert very many entries?
>>>>>>> I didn’t think about that. I assumed it would be enough to set the amount of
>>>>>>> memory a guest domain can use by specifying|xen,domain-p2m-mem-mb| in the DTS,
>>>>>>> or using some predefined value if|xen,domain-p2m-mem-mb| isn’t explicitly set.
>>>>>> Which would require these allocations to come from that pool.
>>>>> Yes, and it is true only for non-hardware domains with the current implementation.
>>>> ???
>>> I meant that pool is used now only for non-hardware domains at the moment.
>> And how does this matter here? The memory required for the radix tree doesn't
>> come from that pool anyway.
> 
> I thought that is possible to do that somehow, but looking at a code of
> radix-tree.c it seems like the only one way to allocate memroy for the radix
> tree isradix_tree_node_alloc() -> xzalloc(struct rcu_node).
> 
> Then it is needed to introduce radix_tree_node_allocate(domain)
That would be a possibility, but you may have seen that less than half a
year ago we got rid of something along these lines. So it would require
some pretty good justification to re-introduce.
> or radix tree
> can't be used at all for mentioned in the previous replies security reason, no?
(Very) careful use may still be possible. But the downside of using this
(potentially long lookup times) would always remain.
>>>>>>> Also, it seems this would just lead to the issue you mentioned earlier: when
>>>>>>> the memory runs out,|domain_crash()| will be called or PTE will be zapped.
>>>>>> Or one domain exhausting memory would cause another domain to fail. A domain
>>>>>> impacting just itself may be tolerable. But a domain affecting other domains
>>>>>> isn't.
>>>>> But it seems like this issue could happen in any implementation. It won't happen only
>>>>> if we will have only pre-populated pool for any domain type (hardware, control, guest
>>>>> domain) without ability to extend them or allocate extra pages from domheap in runtime.
>>>>> Otherwise, if extra pages allocation is allowed then we can't really do something
>>>>> with this issue.
>>>> But that's why I brought this up: You simply have to. Or, as indicated, the
>>>> moment you mark Xen security-supported on RISC-V, there will be an XSA needed.
>>> Why it isn't XSA for other architectures? At least, Arm then should have such
>>> XSA.
>> Does Arm use a radix tree for storing types? It uses one for mem-access, but
>> it's not clear to me whether that's actually a supported feature.
>>
>>> I don't understand why x86 won't have the same issue. Memory is the limited
>>> and shared resource, so if one of the domain will use to much memory then it could
>>> happen that other domains won't have enough memory for its purpose...
>> The question is whether allocations are bounded. With this use of a radix tree,
>> you give domains a way to have Xen allocate pretty much arbitrary amounts of
>> memory to populate that tree. That unbounded-ness is the problem, not memory
>> allocations in general.
> 
> Isn't radix tree key bounded to an amount of GFNs given for a domain? We can't have
> more keys then a max GFN number for a domain. So a potential amount of necessary memory
> for radix tree is also bounded to an amount of GFNs.
To some degree yes, hence why I said "pretty much arbitrary amounts".
But recall that "amount of GFNs" is a fuzzy term; I think you mean to
use it to describe the amount of memory pages given to the guest. GFNs
can be used for other purposes, though. Guests could e.g. grant
themselves access to their own memory, then map those grants at
otherwise unused GFNs.
> Anyway, IIUC I just can't use radix tree for p2m types at all, right?
> If yes, does it make sense to borrow 2 bits from struct page_info->type_info as now it
> is used 9-bits for count of a frame?
struct page_info describes MFNs, when you want to describe GFNs. As you
mentioned earlier, multiple GFNs can in principle map to the same MFN.
You would force them to all have the same properties, which would be in
direct conflict with e.g. the grant P2M types.
Just to mention one possible alternative to using radix trees: You could
maintain a 2nd set of intermediate "page tables", just that leaf entries
would hold meta data for the respective GFN. The memory for those "page
tables" could come from the normal P2M pool (and allocation would thus
only consume domain-specific resources). Of course in any model like
this the question of lookup times (as mentioned above) would remain.
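Very roughly, and purely as an illustration (none of these structures or names
exist anywhere; the number of levels, the element width, and where exactly the
allocations come from would all need working out):

    /* Leaf slots hold a p2m type; intermediate levels hold pointers. */
    struct meta_table {
        union {
            struct meta_table *next[512];
            uint8_t type[512];
        };
    };

    static uint8_t *meta_slot(struct meta_table *root, unsigned long gfn,
                              unsigned int levels)
    {
        struct meta_table *t = root;
        unsigned int i;

        for ( i = levels; i-- > 1; )
        {
            unsigned int idx = (gfn >> (i * 9)) & 0x1ff;

            if ( !t->next[idx] )
                return NULL; /* here one would allocate, from the P2M pool */
            t = t->next[idx];
        }

        return &t->type[gfn & 0x1ff];
    }

Lookups/updates would then walk this structure much like the real P2M tables
are walked.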
Jan

On 7/21/25 2:18 PM, Jan Beulich wrote:
> On 18.07.2025 11:52, Oleksii Kurochko wrote:
>> On 7/17/25 12:25 PM, Jan Beulich wrote:
>>> On 17.07.2025 10:56, Oleksii Kurochko wrote:
>>>> On 7/16/25 6:18 PM, Jan Beulich wrote:
>>>>> On 16.07.2025 18:07, Oleksii Kurochko wrote:
>>>>>> On 7/16/25 1:31 PM, Jan Beulich wrote:
>>>>>>> On 15.07.2025 16:47, Oleksii Kurochko wrote:
>>>>>>>> On 7/1/25 5:08 PM, Jan Beulich wrote:
>>>>>>>>> On 10.06.2025 15:05, Oleksii Kurochko wrote:
>>>>>>>>>> --- a/xen/arch/riscv/p2m.c
>>>>>>>>>> +++ b/xen/arch/riscv/p2m.c
>>>>>>>>>> @@ -345,6 +345,26 @@ static pte_t *p2m_get_root_pointer(struct p2m_domain *p2m, gfn_t gfn)
>>>>>>>>>>           return __map_domain_page(p2m->root + root_table_indx);
>>>>>>>>>>       }
>>>>>>>>>>       
>>>>>>>>>> +static int p2m_type_radix_set(struct p2m_domain *p2m, pte_t pte, p2m_type_t t)
>>>>>>>>> See comments on the earlier patch regarding naming.
>>>>>>>>>
>>>>>>>>>> +{
>>>>>>>>>> +    int rc;
>>>>>>>>>> +    gfn_t gfn = mfn_to_gfn(p2m->domain, mfn_from_pte(pte));
>>>>>>>>> How does this work, when you record GFNs only for Xenheap pages?
>>>>>>>> I think I don't understand what is an issue. Could you please provide
>>>>>>>> some extra details?
>>>>>>> Counter question: The mfn_to_gfn() you currently have is only a stub. It only
>>>>>>> works for 1:1 mapped domains. Can you show me the eventual final implementation
>>>>>>> of the function, making it possible to use it here?
>>>>>> At the moment, I planned to support only 1:1 mapped domains, so it is final
>>>>>> implementation.
>>>>> Isn't that on overly severe limitation?
>>>> I wouldn't say that it's a severe limitation, as it's just a matter of how
>>>> |mfn_to_gfn()| is implemented. When non-1:1 mapped domains are supported,
>>>> |mfn_to_gfn()| can be implemented differently, while the code where it’s called
>>>> will likely remain unchanged.
>>>>
>>>> What I meant in my reply is that, for the current state and current limitations,
>>>> this is the final implementation of|mfn_to_gfn()|. But that doesn't mean I don't
>>>> see the value in, or the need for, non-1:1 mapped domains—it's just that this
>>>> limitation simplifies development at the current stage of the RISC-V port.
>>> Simplification is fine in some cases, but not supporting the "normal" way of
>>> domain construction looks like a pretty odd restriction. I'm also curious
>>> how you envision to implement mfn_to_gfn() then, suitable for generic use like
>>> the one here. Imo, current limitation or not, you simply want to avoid use of
>>> that function outside of the special gnttab case.
>>>
>>>>>>>>> In this context (not sure if I asked before): With this use of a radix tree,
>>>>>>>>> how do you intend to bound the amount of memory that a domain can use, by
>>>>>>>>> making Xen insert very many entries?
>>>>>>>> I didn’t think about that. I assumed it would be enough to set the amount of
>>>>>>>> memory a guest domain can use by specifying|xen,domain-p2m-mem-mb| in the DTS,
>>>>>>>> or using some predefined value if|xen,domain-p2m-mem-mb| isn’t explicitly set.
>>>>>>> Which would require these allocations to come from that pool.
>>>>>> Yes, and it is true only for non-hardware domains with the current implementation.
>>>>> ???
>>>> I meant that pool is used now only for non-hardware domains at the moment.
>>> And how does this matter here? The memory required for the radix tree doesn't
>>> come from that pool anyway.
>> I thought that is possible to do that somehow, but looking at a code of
>> radix-tree.c it seems like the only one way to allocate memroy for the radix
>> tree isradix_tree_node_alloc() -> xzalloc(struct rcu_node).
>>
>> Then it is needed to introduce radix_tree_node_allocate(domain)
> That would be a possibility, but you may have seen that less than half a
> year ago we got rid of something along these lines. So it would require
> some pretty good justification to re-introduce.
>
>> or radix tree
>> can't be used at all for mentioned in the previous replies security reason, no?
> (Very) careful use may still be possible. But the downside of using this
> (potentially long lookup times) would always remain.
Could you please clarify what you mean here by "(Very) careful"?
I thought about introducing a budget of allowed radix tree keys per domain and, if this
budget drops to 0, stopping the domain. But it is also unclear what the value of this
budget should be. Probably you have a better idea.
But generally your idea below ...
>
>>>>>>>> Also, it seems this would just lead to the issue you mentioned earlier: when
>>>>>>>> the memory runs out,|domain_crash()| will be called or PTE will be zapped.
>>>>>>> Or one domain exhausting memory would cause another domain to fail. A domain
>>>>>>> impacting just itself may be tolerable. But a domain affecting other domains
>>>>>>> isn't.
>>>>>> But it seems like this issue could happen in any implementation. It won't happen only
>>>>>> if we will have only pre-populated pool for any domain type (hardware, control, guest
>>>>>> domain) without ability to extend them or allocate extra pages from domheap in runtime.
>>>>>> Otherwise, if extra pages allocation is allowed then we can't really do something
>>>>>> with this issue.
>>>>> But that's why I brought this up: You simply have to. Or, as indicated, the
>>>>> moment you mark Xen security-supported on RISC-V, there will be an XSA needed.
>>>> Why it isn't XSA for other architectures? At least, Arm then should have such
>>>> XSA.
>>> Does Arm use a radix tree for storing types? It uses one for mem-access, but
>>> it's not clear to me whether that's actually a supported feature.
>>>
>>>> I don't understand why x86 won't have the same issue. Memory is the limited
>>>> and shared resource, so if one of the domain will use to much memory then it could
>>>> happen that other domains won't have enough memory for its purpose...
>>> The question is whether allocations are bounded. With this use of a radix tree,
>>> you give domains a way to have Xen allocate pretty much arbitrary amounts of
>>> memory to populate that tree. That unbounded-ness is the problem, not memory
>>> allocations in general.
>> Isn't radix tree key bounded to an amount of GFNs given for a domain? We can't have
>> more keys then a max GFN number for a domain. So a potential amount of necessary memory
>> for radix tree is also bounded to an amount of GFNs.
> To some degree yes, hence why I said "pretty much arbitrary amounts".
> But recall that "amount of GFNs" is a fuzzy term; I think you mean to
> use it to describe the amount of memory pages given to the guest. GFNs
> can be used for other purposes, though. Guests could e.g. grant
> themselves access to their own memory, then map those grants at
> otherwise unused GFNs.
>
>> Anyway, IIUC I just can't use radix tree for p2m types at all, right?
>> If yes, does it make sense to borrow 2 bits from struct page_info->type_info as now it
>> is used 9-bits for count of a frame?
> struct page_info describes MFNs, when you want to describe GFNs. As you
> mentioned earlier, multiple GFNs can in principle map to the same MFN.
> You would force them to all have the same properties, which would be in
> direct conflict with e.g. the grant P2M types.
>
> Just to mention one possible alternative to using radix trees: You could
> maintain a 2nd set of intermediate "page tables", just that leaf entries
> would hold meta data for the respective GFN. The memory for those "page
> tables" could come from the normal P2M pool (and allocation would thus
> only consume domain-specific resources). Of course in any model like
> this the question of lookup times (as mentioned above) would remain.
...looks like an optimal option.
The only thing I worry about is that it will require some code duplication
(I will think about how to re-use the current code), since, for example, when
setting/getting metadata no TLB flushing is needed at all, as we aren't
working with real P2M page tables.
Agreed that lookup times won't be great, but nothing can be done about that
with such models.
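Not part of the thread, just a rough illustration of the alternative sketched above: a
metadata tree mirroring the P2M layout. Every name below (union md_entry, md_index(),
map_metadata_table(), P2M_LEVELS) is invented for the purpose, and map/unmap handling
is elided.

/*
 * Rough sketch only.  Non-leaf slots hold the MFN of the next-level
 * metadata table, leaf slots hold the per-GFN metadata (e.g. the stored
 * p2m type).  The backing pages would come from the per-domain P2M pool,
 * so the allocations stay bounded by that pool.
 */
union md_entry {
    unsigned long next_mfn;   /* non-leaf level: MFN of next metadata table */
    uint16_t meta;            /* leaf level: stored p2m type */
};

static uint16_t md_lookup(union md_entry *root, gfn_t gfn)
{
    union md_entry *table = root;
    unsigned int level = P2M_LEVELS - 1;        /* same depth as the real P2M */

    for ( ; ; level-- )
    {
        union md_entry e = table[md_index(level, gfn)];

        if ( level == 0 )
            return e.meta;                      /* leaf: the recorded metadata */

        if ( !e.next_mfn )
            return 0;                           /* nothing recorded for this GFN */

        table = map_metadata_table(e.next_mfn); /* unmapping elided */
    }
}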
~ Oleksii

On 22.07.2025 12:41, Oleksii Kurochko wrote:
[...]
> 
> Could you please clarify what you mean here by "(Very) careful"?
> I thought about introducing a limit on the number of keys that may be inserted into the
> radix tree and stopping the domain once that limit is exhausted. It is also unclear what
> the value of such a limit should be. Probably, you have a better idea.
I had no particular idea in mind. I said "(very) careful" merely to clarify
that whatever model is chosen, it would need to satisfy certain needs.
Jan

On 7/22/25 12:41 PM, Oleksii Kurochko wrote:
[...]
>> Just to mention one possible alternative to using radix trees: You could
>> maintain a 2nd set of intermediate "page tables", just that leaf entries
>> would hold meta data for the respective GFN. The memory for those "page
>> tables" could come from the normal P2M pool (and allocation would thus
>> only consume domain-specific resources). Of course in any model like
>> this the question of lookup times (as mentioned above) would remain.
> ...looks like an optimal option.
>
>> The only thing I worry about is that it will require some code duplication
>> (I will think about how to re-use the current code), since, for example, when
>> setting/getting metadata no TLB flushing is needed at all, as we aren't
>> working with real P2M page tables.
>> Agreed that lookup times won't be great, but nothing can be done about that
>> with such models.
Probably, instead of having a second set of intermediate "page tables",
we could just allocate two consecutive pages for each intermediate page
table within the real P2M page tables. The first page would serve as
the actual page table the intermediate entry points to, and the second
page would store metadata for each entry of that page table.

As we support only 1 GiB, 2 MiB and 4 KiB mappings, we could do a little
optimization and allocate these consecutive pages only for the page-table
levels that correspond to those mapping sizes.

Does it make sense?
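A minimal illustration of the pairing described above (not part of the series), assuming
the two pages of a pair come from one order-1 allocation and are therefore contiguous in
the mapping used to access them; pte_meta_t and pte_metadata() are invented names.

/*
 * Illustrative sketch only: with a page table and its metadata page
 * allocated (and mapped) as one contiguous order-1 pair, the metadata
 * slot for a PTE sits at the same index in the page following the table.
 */
typedef uint8_t pte_meta_t;                     /* e.g. the stored p2m type */

static pte_meta_t *pte_metadata(pte_t *entry)
{
    unsigned long addr = (unsigned long)entry;
    unsigned long table = addr & ~(PAGE_SIZE - 1UL);           /* table base */
    unsigned int idx = (addr & (PAGE_SIZE - 1)) / sizeof(*entry);

    return (pte_meta_t *)(table + PAGE_SIZE) + idx;            /* metadata page */
}

The pair itself would then have to come from an order-1 allocation (e.g. via
alloc_domheap_pages() with order 1), which is exactly the "adjacent pairs of pages"
concern raised in the reply below.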
~ Oleksii

On 22.07.2025 13:34, Oleksii Kurochko wrote:
[...]
> 
> Probably, instead of having a second set of intermediate "page tables",
> we could just allocate two consecutive pages for each intermediate page
> table within the real P2M page tables. The first page would serve as
> the actual page table the intermediate entry points to, and the second
> page would store metadata for each entry of that page table.
> 
> As we support only 1 GiB, 2 MiB and 4 KiB mappings, we could do a little
> optimization and allocate these consecutive pages only for the page-table
> levels that correspond to those mapping sizes.
> 
> Does it make sense?
I was indeed entertaining this idea, but I couldn't conclude for myself if
that would indeed be without any rough edges. Hence I didn't want to
suggest such. For example, the need to have adjacent pairs of pages could
result in a higher rate of allocation failures (while populating or
re-sizing the P2M pool). This would be possible to avoid by still using
entirely separate pages, and then merely linking them together via some
unused struct page_info fields (the "normal" linking fields can't be used,
afaict).
Jan

On 7/22/25 2:00 PM, Jan Beulich wrote:
[...]
> I was indeed entertaining this idea, but I couldn't conclude for myself if
> that would indeed be without any rough edges. Hence I didn't want to
> suggest such. For example, the need to have adjacent pairs of pages could
> result in a higher rate of allocation failures (while populating or
> re-sizing the P2M pool). This would be possible to avoid by still using
> entirely separate pages, and then merely linking them together via some
> unused struct page_info fields (the "normal" linking fields can't be used,
> afaict).
I think that all the fields are used, so a new
"struct page_list_entry metadata_list;" would need to be introduced.
Can't we introduce a new PGT_METADATA type, add the metadata page to
struct page_info->list, and, whenever metadata is needed, just iterate
through page_info->list and find the page with the PGT_METADATA type?
~ Oleksii

On 22.07.2025 16:25, Oleksii Kurochko wrote:
[...]
> 
> I think that all the fields are used, so a new
> "struct page_list_entry metadata_list;" would need to be introduced.
All the fields are used _somewhere_, sure. But once you have allocated a
page (and that page isn't assigned to a domain), you control what the
fields are used for. Or else enlisting pages on private lists wouldn't be
legitimate either.
> Can't we introduce a new PGT_METADATA type, add the metadata page to
> struct page_info->list, and, whenever metadata is needed, just iterate
> through page_info->list and find the page with the PGT_METADATA type?
I'd be careful with the introduction of new page types. All handling of
page types everywhere in the (affected part of the) code base would then
need auditing.
Jan

On 7/22/25 4:35 PM, Jan Beulich wrote:
[...]
>> I think that all the fields are already used, so a new
>> "struct page_list_entry metadata_list;" would need to be introduced.
> All the fields are used _somewhere_, sure. But once you have allocated a
> page (and that page isn't assigned to a domain), you control what the
> fields are used for.
I thought that the whole idea is to use the domain's pages from the P2M pool freelist,
pages which are allocated by alloc_domheap_page(d, MEMF_no_owner), so an
allocated page is assigned to a domain.
I assume that in this case I have to take some pages for an intermediate page
table from the P2M pool freelist and set the owner domain to NULL (pg->inuse.domain = NULL).
Then it isn't clear why pg->list can't be re-used to link several pages
for intermediate page table purposes + metadata. Is it because pg->list may not be
empty? In that case it isn't clear whether I could use a page which has pages threaded on it.
page_info->count_info can't be re-used, as that would break the put_page_*() machinery.
For a similar reason page_info->v.{...} can't be re-used, as page_get_owner()
would then be broken.
And page_info->tlbflush_timestamp is still needed by the common allocation code to
decide when to do a TLB flush.
So if I add something to page_info->v.{...} or page_info->u.{...}, the functions
mentioned above can't be used anymore for pages which are used for intermediate
page tables.
>   Or else enlisting pages on private lists wouldn't be
> legitimate either.
Hmm, but I still need to link several pages together somehow.
Or did you mean just having a field which stores the physical address of the metadata?
(which still effectively looks like a list)
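For example, roughly what I have in mind (only a sketch; the field name "md_mfn" and
the helper are made up for illustration):

    /*
     * Sketch: each page-table page records the MFN of a separate metadata
     * page, which holds one p2m_type_t per PTE slot of that page table.
     * "md_mfn" is a hypothetical field in struct page_info (wherever it
     * would end up living).
     */
    static p2m_type_t metadata_get_type(const struct page_info *pt_page,
                                        unsigned int slot)
    {
        p2m_type_t *md = map_domain_page(pt_page->md_mfn);
        p2m_type_t t = md[slot];

        unmap_domain_page(md);

        return t;
    }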
>
>> Can't we introduce a new PGT_METADATA type, add the metadata page to
>> struct page_info->list, and, when metadata is needed, just iterate through
>> page_info->list to find a page with the PGT_METADATA type?
> I'd be careful with the introduction of new page types. All handling of
> page types everywhere in the (affected part of the) code base would then
> need auditing.
~ Oleksii

On 22.07.2025 18:07, Oleksii Kurochko wrote:
>> All the fields are used _somewhere_, sure. But once you have allocated a
>> page (and that page isn't assigned to a domain), you control what the
>> fields are used for.
> 
> I thought that the whole idea is to use the domain's pages from the P2M pool freelist,
> pages which are allocated by alloc_domheap_page(d, MEMF_no_owner), so an
> allocated page is assigned to a domain.
You did check what effect MEMF_no_owner has, didn't you? Such pages are _not_
assigned to the domain.
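I.e., as a minimal sketch of the semantics:

    struct page_info *pg = alloc_domheap_page(d, MEMF_no_owner);

    /* With MEMF_no_owner the page is not assigned to d, so no owner is recorded. */
    if ( pg )
        ASSERT(page_get_owner(pg) == NULL);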
> I assume that in this case I have to take some pages for an intermediate page
> table from the P2M pool freelist and set the owner domain to NULL (pg->inuse.domain = NULL).
>
> Then it isn't clear why pg->list can't be re-used to link several pages
> for intermediate page table purposes + metadata. Is it because pg->list may not be
> empty? In that case it isn't clear whether I could use a page which has pages threaded on it.
Actually looks like I was mis-remembering. Pages removed from freelist indeed
aren't put on any other list, so the linking fields are available for use. I
guess I had x86 shadow code in mind, where the linking fields are further used.
Jan

On 7/23/25 11:46 AM, Jan Beulich wrote:
> Actually looks like I was mis-remembering. Pages removed from freelist indeed
> aren't put on any other list, so the linking fields are available for use. I
> guess I had x86 shadow code in mind, where the linking fields are further used.
Perhaps I misunderstood you about "linking fields", but it seems like I can't reuse
struct page_info->list, as it is used by page_list_add(), which is called by p2m_alloc_page()
to allocate page(s) for an intermediate page table:
    static inline void
    page_list_add(struct page_info *page, struct page_list_head *head)
    {
        list_add(&page->list, head);
    }

    struct page_info *paging_alloc_page(struct domain *d)
    {
        struct page_info *pg;

        spin_lock(&d->arch.paging.lock);
        pg = page_list_remove_head(&d->arch.paging.freelist);
        spin_unlock(&d->arch.paging.lock);

        INIT_LIST_HEAD(&pg->list);

        return pg;
    }

    static struct page_info *p2m_alloc_page(struct domain *d)
    {
        struct page_info *pg = paging_alloc_page(d);

        if ( pg )
            page_list_add(pg, &p2m_get_hostp2m(d)->pages);

        return pg;
    }
So I have to reuse another field from struct page_info. It seems like it won't be an
issue if I add a new struct page_list_entry metadata_list to 'union v':
    union {
        /* Page is in use */
        struct {
            /* Owner of this page (NULL if page is anonymous). */
            struct domain *domain;
        } inuse;

        /* Page is on a free list. */
        struct {
            /* Order-size of the free chunk this page is the head of. */
            unsigned int order;
        } free;
+
+       struct page_list_entry metadata_list;
    } v;
Am I missing something?
~ Oleksii

On 28.07.2025 10:52, Oleksii Kurochko wrote:
> So I have to reuse another field from struct page_info. It seems like it won't be an
> issue if I add a new struct page_list_entry metadata_list to 'union v':
>      union {
>          /* Page is in use */
>          struct {
>              /* Owner of this page (NULL if page is anonymous). */
>              struct domain *domain;
>          } inuse;
> 
>          /* Page is on a free list. */
>          struct {
>              /* Order-size of the free chunk this page is the head of. */
>              unsigned int order;
>          } free;
> +
> +       struct page_list_entry metadata_list;
>      } v;
> 
> Am I missing something?
Well, you're doubling the size of that union then, aren't you? As was mentioned
quite some time ago, struct page_info needs quite a bit of care when you mean
to add new fields there. Question is whether for the purpose here you actually
need a doubly-linked list. A single pointer would be fine to put there.
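I.e. something along these lines (merely a sketch, mirroring the snippet quoted
above; the member name "md" is made up):

    union {
        struct {
            struct domain *domain;
        } inuse;
        struct {
            unsigned int order;
        } free;
        /* P2M page-table page: pointer to its metadata page. */
        struct page_info *md; /* pointer-sized, so the union doesn't grow */
    } v;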
Jan

On 7/28/25 11:09 AM, Jan Beulich wrote:
> Well, you're doubling the size of that union then, aren't you? As was mentioned
> quite some time ago, struct page_info needs quite a bit of care when you mean
> to add new fields there. Question is whether for the purpose here you actually
> need a doubly-linked list. A single pointer would be fine to put there.
Agree, a single pointer will be more than enough.

I'm wondering whether something can be done about the case where someone tries
to use:
   #define page_get_owner(p)    (p)->v.inuse.domain
for a page which was allocated for metadata storage. Shouldn't I have a separate
list for such pages, plus a macro which checks whether a page is on that list?
Similar to the list which we have for p2m pages in struct p2m_domain:
     ...
     /* Pages used to construct the p2m */
     struct page_list_head pages;
     ...

Of course, such pages are allocated by alloc_domheap_page(d, MEMF_no_owner),
so there is no owner. But if someone accidentally uses this macro for such
pages, it will be a problem, as ->domain likely won't be NULL anymore.
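Something like the following is the kind of check I have in mind (just a sketch;
"metadata_pages" would be a new, hypothetical page_list_head in struct p2m_domain
onto which only such pages get put):

    static bool page_is_p2m_metadata(struct p2m_domain *p2m,
                                     const struct page_info *pg)
    {
        struct page_info *iter;

        /* Walk the (hypothetical) list of metadata pages of this p2m. */
        page_list_for_each ( iter, &p2m->metadata_pages )
            if ( iter == pg )
                return true;

        return false;
    }

and then e.g. ASSERT(!page_is_p2m_metadata(p2m, pg)) before relying on
page_get_owner(pg).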
~ Oleksii

On 28.07.2025 13:37, Oleksii Kurochko wrote:
> 
> Agree, a single pointer will be more than enough.
>
> I'm wondering whether something can be done about the case where someone tries
> to use:
>    #define page_get_owner(p)    (p)->v.inuse.domain
> for a page which was allocated for metadata storage. Shouldn't I have a separate
> list for such pages, plus a macro which checks whether a page is on that list?
> Similar to the list which we have for p2m pages in struct p2m_domain:
>      ...
>      /* Pages used to construct the p2m */
>      struct page_list_head pages;
>      ...
>
> Of course, such pages are allocated by alloc_domheap_page(d, MEMF_no_owner),
> so there is no owner. But if someone accidentally uses this macro for such
> pages, it will be a problem, as ->domain likely won't be NULL anymore.
It's the nature of using unions that such a risk exists. Take a look at x86's
structure, where several of the fields are re-purposed for shadow pages. It's
something similar you'd do here, in the end.
Jan