[PATCH v2 3/3] xen/riscv: update mfn calculation in pt_mapping_level()

Oleksii Kurochko posted 3 patches 1 month, 1 week ago
There is a newer version of this series
[PATCH v2 3/3] xen/riscv: update mfn calculation in pt_mapping_level()
Posted by Oleksii Kurochko 1 month, 1 week ago
When pt_update() is called with arguments (..., INVALID_MFN, ..., 0 or 1),
it indicates that a mapping is being destroyed/modified.

In the case when modifying or destroying a mapping, it is necessary to
search until a leaf node is found, instead of searching for a page table
entry based on the precalculated `level` and `order` returned from pt_update().
This is because when `mfn` == INVALID_MFN, the `mask` (in pt_mapping_level())
will take into account only `vfn`, which could accidentally return an
incorrect level, leading to the discovery of an incorrect page table entry.

For example, if `vfn` is page table level 1 aligned, but it was mapped as
page table level 0, then pt_mapping_level() will return `level` = 1, since
only `vfn` (which is page table level 1 aligned) is taken into account when
`mfn` == INVALID_MFN (look at pt_mapping_level()).

Fixes: c2f1ded524 ("xen/riscv: page table handling")
Signed-off-by: Oleksii Kurochko <oleksii.kurochko@gmail.com>
---
Changes in v2:
 - Introduce PTE_LEAF_SEARCH to tell page table update operation to
   walk down to wherever the leaf entry is.
 - Use the introduced PTE_LEAF_SEARCH to avoid searching for the pte_t entry twice.
 - Update the commit message.
---
 xen/arch/riscv/include/asm/page.h | 16 ++++++
 xen/arch/riscv/pt.c               | 87 +++++++++++++++++++------------
 2 files changed, 69 insertions(+), 34 deletions(-)

diff --git a/xen/arch/riscv/include/asm/page.h b/xen/arch/riscv/include/asm/page.h
index b9076173f4..72d29376bc 100644
--- a/xen/arch/riscv/include/asm/page.h
+++ b/xen/arch/riscv/include/asm/page.h
@@ -55,6 +55,22 @@
 #define PTE_SMALL       BIT(10, UL)
 #define PTE_POPULATE    BIT(11, UL)
 
+/*
+ * In the case when modifying or destroying a mapping, it is necessary to
+ * search until a leaf node is found, instead of searching for a page table
+ * entry based on the precalculated `level` and `order` (look at pt_update()).
+ * This is because when `mfn` == INVALID_MFN, the `mask`(in pt_mapping_level())
+ * will take into account only `vfn`, which could accidentally return an
+ * incorrect level, leading to the discovery of an incorrect page table entry.
+ *
+ * For example, if `vfn` is page table level 1 aligned, but it was mapped as
+ * page table level 0, then pt_mapping_level() will return `level` = 1,
+ * since only `vfn` (which is page table level 1 aligned) is taken into account
+ * when `mfn` == INVALID_MFN (look at pt_mapping_level()).
+ */
+
+#define PTE_LEAF_SEARCH BIT(12, UL)
+
 #define PTE_ACCESS_MASK (PTE_READABLE | PTE_WRITABLE | PTE_EXECUTABLE)
 
 /* Calculate the offsets into the pagetables for a given VA */
diff --git a/xen/arch/riscv/pt.c b/xen/arch/riscv/pt.c
index 2a5a191a70..9db41eac53 100644
--- a/xen/arch/riscv/pt.c
+++ b/xen/arch/riscv/pt.c
@@ -187,11 +187,10 @@ static int pt_next_level(bool alloc_tbl, pte_t **table, unsigned int offset)
 
 /* Update an entry at the level @target. */
 static int pt_update_entry(mfn_t root, vaddr_t virt,
-                           mfn_t mfn, unsigned int target,
+                           mfn_t mfn, unsigned int *target,
                            unsigned int flags)
 {
     int rc;
-    unsigned int level = HYP_PT_ROOT_LEVEL;
     pte_t *table;
     /*
      * The intermediate page table shouldn't be allocated when MFN isn't
@@ -205,39 +204,48 @@ static int pt_update_entry(mfn_t root, vaddr_t virt,
     bool alloc_tbl = !mfn_eq(mfn, INVALID_MFN) || (flags & PTE_POPULATE);
     pte_t pte, *entry;
 
-    /* convenience aliases */
-    DECLARE_OFFSETS(offsets, virt);
-
-    table = map_table(root);
-    for ( ; level > target; level-- )
+    if ( flags & PTE_LEAF_SEARCH )
     {
-        rc = pt_next_level(alloc_tbl, &table, offsets[level]);
-        if ( rc == XEN_TABLE_MAP_NOMEM )
+        entry = pt_walk(virt, target);
+        BUG_ON(!pte_is_mapping(*entry));
+    }
+    else
+    {
+        unsigned int level = HYP_PT_ROOT_LEVEL;
+        /* convenience aliases */
+        DECLARE_OFFSETS(offsets, virt);
+
+        table = map_table(root);
+        for ( ; level > *target; level-- )
         {
-            rc = -ENOMEM;
-            goto out;
+            rc = pt_next_level(alloc_tbl, &table, offsets[level]);
+            if ( rc == XEN_TABLE_MAP_NOMEM )
+            {
+                rc = -ENOMEM;
+                goto out;
+            }
+
+            if ( rc == XEN_TABLE_MAP_NONE )
+            {
+                rc = 0;
+                goto out;
+            }
+
+            if ( rc != XEN_TABLE_NORMAL )
+                break;
         }
 
-        if ( rc == XEN_TABLE_MAP_NONE )
+        if ( level != *target )
         {
-            rc = 0;
+            dprintk(XENLOG_ERR,
+                    "%s: Shattering superpage is not supported\n", __func__);
+            rc = -EOPNOTSUPP;
             goto out;
         }
 
-        if ( rc != XEN_TABLE_NORMAL )
-            break;
+        entry = table + offsets[level];
     }
 
-    if ( level != target )
-    {
-        dprintk(XENLOG_ERR,
-                "%s: Shattering superpage is not supported\n", __func__);
-        rc = -EOPNOTSUPP;
-        goto out;
-    }
-
-    entry = table + offsets[level];
-
     rc = -EINVAL;
     if ( !pt_check_entry(*entry, mfn, flags) )
         goto out;
@@ -345,9 +353,6 @@ static int pt_mapping_level(unsigned long vfn, mfn_t mfn, unsigned long nr,
         return level;
 
     /*
-     * Don't take into account the MFN when removing mapping (i.e
-     * MFN_INVALID) to calculate the correct target order.
-     *
      * `vfn` and `mfn` must be both superpage aligned.
      * They are or-ed together and then checked against the size of
      * each level.
@@ -415,19 +420,33 @@ static int pt_update(vaddr_t virt, mfn_t mfn,
 
     spin_lock(&pt_lock);
 
-    while ( left )
+    /* look at the comment above the definition of PTE_LEAF_SEARCH */
+    if ( mfn_eq(mfn, INVALID_MFN) && !(flags & PTE_POPULATE) )
     {
-        unsigned int order, level;
+        flags |= PTE_LEAF_SEARCH;
+    }
 
-        level = pt_mapping_level(vfn, mfn, left, flags);
-        order = XEN_PT_LEVEL_ORDER(level);
+    while ( left )
+    {
+        unsigned int order = 0, level;
 
-        ASSERT(left >= BIT(order, UL));
+        if ( !(flags & PTE_LEAF_SEARCH) )
+        {
+            level = pt_mapping_level(vfn, mfn, left, flags);
+            order = XEN_PT_LEVEL_ORDER(level);
+            ASSERT(left >= BIT(order, UL));
+        }
 
-        rc = pt_update_entry(root, vfn << PAGE_SHIFT, mfn, level, flags);
+        rc = pt_update_entry(root, vfn << PAGE_SHIFT, mfn, &level, flags);
         if ( rc )
             break;
 
+        if ( flags & PTE_LEAF_SEARCH )
+        {
+            order = XEN_PT_LEVEL_ORDER(level);
+            ASSERT(left >= BIT(order, UL));
+        }
+
         vfn += 1UL << order;
         if ( !mfn_eq(mfn, INVALID_MFN) )
             mfn = mfn_add(mfn, 1UL << order);
-- 
2.48.1
Re: [PATCH v2 3/3] xen/riscv: update mfn calculation in pt_mapping_level()
Posted by Jan Beulich 1 month, 1 week ago
On 03.02.2025 14:12, Oleksii Kurochko wrote:
> --- a/xen/arch/riscv/include/asm/page.h
> +++ b/xen/arch/riscv/include/asm/page.h
> @@ -55,6 +55,22 @@
>  #define PTE_SMALL       BIT(10, UL)
>  #define PTE_POPULATE    BIT(11, UL)
>  
> +/*
> + * In the case when modifying or destroying a mapping, it is necessary to
> + * search until a leaf node is found, instead of searching for a page table
> + * entry based on the precalculated `level` and `order` (look at pt_update()).
> + * This is because when `mfn` == INVALID_MFN, the `mask`(in pt_mapping_level())
> + * will take into account only `vfn`, which could accidentally return an
> + * incorrect level, leading to the discovery of an incorrect page table entry.
> + *
> + * For example, if `vfn` is page table level 1 aligned, but it was mapped as
> + * page table level 0, then pt_mapping_level() will return `level` = 1,
> + * since only `vfn` (which is page table level 1 aligned) is taken into account
> + * when `mfn` == INVALID_MFN (look at pt_mapping_level()).
> + */
> +
> +#define PTE_LEAF_SEARCH BIT(12, UL)

Is it intended for callers outside of pt.c to make use of this? If not,
it better wouldn't be globally exposed.

Furthermore, this isn't a property of the PTE(s) to be created, so is
likely wrong to mix with PTE_* flags. (PTE_POPULATE is on the edge of
also falling in this category, btw.) Perhaps ...

> --- a/xen/arch/riscv/pt.c
> +++ b/xen/arch/riscv/pt.c
> @@ -187,11 +187,10 @@ static int pt_next_level(bool alloc_tbl, pte_t **table, unsigned int offset)
>  
>  /* Update an entry at the level @target. */
>  static int pt_update_entry(mfn_t root, vaddr_t virt,
> -                           mfn_t mfn, unsigned int target,
> +                           mfn_t mfn, unsigned int *target,

... you instead want to have callers of this function preset *target
to a special value (e.g. UINT_MAX or CONFIG_PAGING_LEVELS) indicating
the level is wanted as an output.

> @@ -205,39 +204,48 @@ static int pt_update_entry(mfn_t root, vaddr_t virt,
>      bool alloc_tbl = !mfn_eq(mfn, INVALID_MFN) || (flags & PTE_POPULATE);
>      pte_t pte, *entry;
>  
> -    /* convenience aliases */
> -    DECLARE_OFFSETS(offsets, virt);
> -
> -    table = map_table(root);
> -    for ( ; level > target; level-- )
> +    if ( flags & PTE_LEAF_SEARCH )
>      {
> -        rc = pt_next_level(alloc_tbl, &table, offsets[level]);
> -        if ( rc == XEN_TABLE_MAP_NOMEM )
> +        entry = pt_walk(virt, target);
> +        BUG_ON(!pte_is_mapping(*entry));

Is this really necessarily a bug? Can't one want to determine how deep
the (populated) page tables are for a given VA?

Hmm, here I can see why you have pt_walk() return a pointer. As per the
comment on the earlier patch, I don't think this is a good idea. You
may want to have

static pte_t *_pt_walk(...)
{
    ...
}

pte_t pt_walk(...)
{
    return *_pt_walk(...);
}

> @@ -345,9 +353,6 @@ static int pt_mapping_level(unsigned long vfn, mfn_t mfn, unsigned long nr,
>          return level;
>  
>      /*
> -     * Don't take into account the MFN when removing mapping (i.e
> -     * MFN_INVALID) to calculate the correct target order.
> -     *
>       * `vfn` and `mfn` must be both superpage aligned.
>       * They are or-ed together and then checked against the size of
>       * each level.

You drop part of the comment without altering the code being commented.
What's the deal?

> @@ -415,19 +420,33 @@ static int pt_update(vaddr_t virt, mfn_t mfn,
>  
>      spin_lock(&pt_lock);
>  
> -    while ( left )
> +    /* look at the comment above the definition of PTE_LEAF_SEARCH */
> +    if ( mfn_eq(mfn, INVALID_MFN) && !(flags & PTE_POPULATE) )
>      {
> -        unsigned int order, level;
> +        flags |= PTE_LEAF_SEARCH;
> +    }

For readability I think it would be better if the curly braces were
dropped.

> -        level = pt_mapping_level(vfn, mfn, left, flags);
> -        order = XEN_PT_LEVEL_ORDER(level);
> +    while ( left )
> +    {
> +        unsigned int order = 0, level;
>  
> -        ASSERT(left >= BIT(order, UL));
> +        if ( !(flags & PTE_LEAF_SEARCH) )
> +        {
> +            level = pt_mapping_level(vfn, mfn, left, flags);
> +            order = XEN_PT_LEVEL_ORDER(level);
> +            ASSERT(left >= BIT(order, UL));

Assignment to order and assertion are ...

> +        }
>  
> -        rc = pt_update_entry(root, vfn << PAGE_SHIFT, mfn, level, flags);
> +        rc = pt_update_entry(root, vfn << PAGE_SHIFT, mfn, &level, flags);
>          if ( rc )
>              break;
>  
> +        if ( flags & PTE_LEAF_SEARCH )
> +        {
> +            order = XEN_PT_LEVEL_ORDER(level);
> +            ASSERT(left >= BIT(order, UL));
> +        }

... repeated here, with neither left nor order being passed into
pt_update_entry(). Does this really need doing twice? (I have to
admit that I have trouble determining what the assertion is about.
For order alone it clearly could be done centrally after the call.)

Jan
Re: [PATCH v2 3/3] xen/riscv: update mfn calculation in pt_mapping_level()
Posted by Oleksii Kurochko 1 month ago
On 2/4/25 3:57 PM, Jan Beulich wrote:
> On 03.02.2025 14:12, Oleksii Kurochko wrote:
>> --- a/xen/arch/riscv/include/asm/page.h
>> +++ b/xen/arch/riscv/include/asm/page.h
>> @@ -55,6 +55,22 @@
>>   #define PTE_SMALL       BIT(10, UL)
>>   #define PTE_POPULATE    BIT(11, UL)
>>   
>> +/*
>> + * In the case when modifying or destroying a mapping, it is necessary to
>> + * search until a leaf node is found, instead of searching for a page table
>> + * entry based on the precalculated `level` and `order` (look at pt_update()).
>> + * This is because when `mfn` == INVALID_MFN, the `mask`(in pt_mapping_level())
>> + * will take into account only `vfn`, which could accidentally return an
>> + * incorrect level, leading to the discovery of an incorrect page table entry.
>> + *
>> + * For example, if `vfn` is page table level 1 aligned, but it was mapped as
>> + * page table level 0, then pt_mapping_level() will return `level` = 1,
>> + * since only `vfn` (which is page table level 1 aligned) is taken into account
>> + * when `mfn` == INVALID_MFN (look at pt_mapping_level()).
>> + */
>> +
>> +#define PTE_LEAF_SEARCH BIT(12, UL)
> Is it intended for callers outside of pt.c to make use of this? If not,
> it better wouldn't be globally exposed.
>
> Furthermore, this isn't a property of the PTE(s) to be created, so is
> likely wrong to mix with PTE_* flags. (PTE_POPULATE is on the edge of
> also falling in this category, btw.) Perhaps ...
>
>> --- a/xen/arch/riscv/pt.c
>> +++ b/xen/arch/riscv/pt.c
>> @@ -187,11 +187,10 @@ static int pt_next_level(bool alloc_tbl, pte_t **table, unsigned int offset)
>>   
>>   /* Update an entry at the level @target. */
>>   static int pt_update_entry(mfn_t root, vaddr_t virt,
>> -                           mfn_t mfn, unsigned int target,
>> +                           mfn_t mfn, unsigned int *target,
> ... you instead want to have callers of this function preset *target
> to a special value (e.g. UINT_MAX or CONFIG_PAGING_LEVELS) indicating
> the level is wanted as an output.

I thought about this approach but decided to use a separate PTE_* flag, which looked clearer to me at
that moment. But I didn't take into account that it will be used only inside pt.c, so I agree that it should be
declared locally in pt.c and that a special value should be used instead. I will update accordingly.

>
>> @@ -205,39 +204,48 @@ static int pt_update_entry(mfn_t root, vaddr_t virt,
>>       bool alloc_tbl = !mfn_eq(mfn, INVALID_MFN) || (flags & PTE_POPULATE);
>>       pte_t pte, *entry;
>>   
>> -    /* convenience aliases */
>> -    DECLARE_OFFSETS(offsets, virt);
>> -
>> -    table = map_table(root);
>> -    for ( ; level > target; level-- )
>> +    if ( flags & PTE_LEAF_SEARCH )
>>       {
>> -        rc = pt_next_level(alloc_tbl, &table, offsets[level]);
>> -        if ( rc == XEN_TABLE_MAP_NOMEM )
>> +        entry = pt_walk(virt, target);
>> +        BUG_ON(!pte_is_mapping(*entry));
> Is this really necessarily a bug? Can't one want to determine how deep
> the (populated) page tables are for a given VA?

pt_walk() could be used in that way but PTE_LEAF_SEARCH won't be used for page table populating:
     if ( mfn_eq(mfn, INVALID_MFN) && !(flags & PTE_POPULATE) )
     {
         flags |= PTE_LEAF_SEARCH;
     }
so in the current version of the patch only mapped VA <-> PA is expected to be in this part of the code.


>
> Hmm, here I can see why you have pt_walk() return a pointer. As per the
> comment on the earlier patch, I don't think this is a good idea. You
> may want to have
>
> static pte_t *_pt_walk(...)
> {
>      ...
> }
>
> pte_t pt_walk(...)
> {
>      return *_pt_walk(...);
> }

It would be better to do in this way.

>
>> @@ -345,9 +353,6 @@ static int pt_mapping_level(unsigned long vfn, mfn_t mfn, unsigned long nr,
>>           return level;
>>   
>>       /*
>> -     * Don't take into account the MFN when removing mapping (i.e
>> -     * MFN_INVALID) to calculate the correct target order.
>> -     *
>>        * `vfn` and `mfn` must be both superpage aligned.
>>        * They are or-ed together and then checked against the size of
>>        * each level.
> You drop part of the comment without altering the code being commented.
> What's the deal?

These changes are part of the v1 version of this function. Basically, I did an incorrect revert. Thanks for noticing
that; I will bring these comments back.


>
>> @@ -415,19 +420,33 @@ static int pt_update(vaddr_t virt, mfn_t mfn,
>>   
>>       spin_lock(&pt_lock);
>>   
>> -    while ( left )
>> +    /* look at the comment above the definition of PTE_LEAF_SEARCH */
>> +    if ( mfn_eq(mfn, INVALID_MFN) && !(flags & PTE_POPULATE) )
>>       {
>> -        unsigned int order, level;
>> +        flags |= PTE_LEAF_SEARCH;
>> +    }
> For readability I think it would be better if the figure braces were
> dropped.
>
>> -        level = pt_mapping_level(vfn, mfn, left, flags);
>> -        order = XEN_PT_LEVEL_ORDER(level);
>> +    while ( left )
>> +    {
>> +        unsigned int order = 0, level;
>>   
>> -        ASSERT(left >= BIT(order, UL));
>> +        if ( !(flags & PTE_LEAF_SEARCH) )
>> +        {
>> +            level = pt_mapping_level(vfn, mfn, left, flags);
>> +            order = XEN_PT_LEVEL_ORDER(level);
>> +            ASSERT(left >= BIT(order, UL));
> Assignment to order and assertion are ...
>
>> +        }
>>   
>> -        rc = pt_update_entry(root, vfn << PAGE_SHIFT, mfn, level, flags);
>> +        rc = pt_update_entry(root, vfn << PAGE_SHIFT, mfn, &level, flags);
>>           if ( rc )
>>               break;
>>   
>> +        if ( flags & PTE_LEAF_SEARCH )
>> +        {
>> +            order = XEN_PT_LEVEL_ORDER(level);
>> +            ASSERT(left >= BIT(order, UL));
>> +        }
> ... repeated here, with neither left nor order being passed into
> pt_update_entry(). Does this really need doing twice? (I have to
> admit that I have trouble determining what the assertion is about.
> For order alone it clearly could be done centrally after the call.)

Sure, it could be done just once.

Regarding the ASSERT() itself, it was added to make sure that pt_mapping_level() returns the proper `level`.
I am not really sure that it is still needed.

~ Oleksii