[PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap

Elias El Yandouzi posted 15 patches 1 week, 4 days ago
[PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap
Posted by Elias El Yandouzi 1 week, 4 days ago
From: Hongyan Xia <hongyxia@amazon.com>

When mfn is not in direct map, never use mfn_to_virt for any mappings.

We replace mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) with
arch_mfns_in_directmap(mfn_x(mfn), 1) because these two are equivalent. The
extra comparison in arch_mfns_in_directmap() looks different but because
DIRECTMAP_VIRT_END is always higher, it does not make any difference.

Lastly, domain_page_map_to_mfn() needs to gain a special case for
the PMAP.

Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
Signed-off-by: Julien Grall <jgrall@amazon.com>

----

    Changes in v4:
        * Introduce helper functions virt_is_fixmap and virt_in_fixmap_range

    Changes since Hongyan's version:
        * arch_mfn_in_direct_map() was renamed to arch_mfns_in_directmap()
        * add a special case for the PMAP in domain_page_map_to_mfn()

diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 55e337aaf703..df7d4750ef05 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -14,8 +14,10 @@
 #include <xen/sched.h>
 #include <xen/vmap.h>
 #include <asm/current.h>
+#include <asm/fixmap.h>
 #include <asm/flushtlb.h>
 #include <asm/hardirq.h>
+#include <asm/pmap.h>
 #include <asm/setup.h>
 
 static DEFINE_PER_CPU(struct vcpu *, override);
@@ -24,6 +26,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
 {
     /* In the common case we use the mapcache of the running VCPU. */
     struct vcpu *v = this_cpu(override) ?: current;
+    struct vcpu *idle_v = idle_vcpu[smp_processor_id()];
 
     /*
      * When current isn't properly set up yet, this is equivalent to
@@ -35,10 +38,11 @@ static inline struct vcpu *mapcache_current_vcpu(void)
     /*
      * When using efi runtime page tables, we have the equivalent of the idle
      * domain's page tables but current may point at another domain's VCPU.
-     * Return NULL as though current is not properly set up yet.
+     * Return the idle domain's vcpu on that core because the efi per-domain
+     * region (where the mapcache is) is in-sync with the idle domain.
      */
     if ( efi_rs_using_pgtables() )
-        return NULL;
+        return idle_v;
 
     /*
      * If guest_table is NULL, and we are running a paravirtualised guest,
@@ -48,7 +52,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
     if ( unlikely(pagetable_is_null(v->arch.guest_table)) && is_pv_vcpu(v) )
     {
         /* If we really are idling, perform lazy context switch now. */
-        if ( (v = idle_vcpu[smp_processor_id()]) == current )
+        if ( (v = idle_v) == current )
             sync_local_execstate();
         /* We must now be running on the idle page table. */
         ASSERT(cr3_pa(read_cr3()) == __pa(idle_pg_table));
@@ -77,18 +81,24 @@ void *map_domain_page(mfn_t mfn)
     struct vcpu_maphash_entry *hashent;
 
 #ifdef NDEBUG
-    if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+    if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
         return mfn_to_virt(mfn_x(mfn));
 #endif
 
     v = mapcache_current_vcpu();
-    if ( !v )
-        return mfn_to_virt(mfn_x(mfn));
+    if ( !v || !v->domain->arch.mapcache.inuse )
+    {
+        if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
+            return mfn_to_virt(mfn_x(mfn));
+        else
+        {
+            BUG_ON(system_state >= SYS_STATE_smp_boot);
+            return pmap_map(mfn);
+        }
+    }
 
     dcache = &v->domain->arch.mapcache;
     vcache = &v->arch.mapcache;
-    if ( !dcache->inuse )
-        return mfn_to_virt(mfn_x(mfn));
 
     perfc_incr(map_domain_page_count);
 
@@ -184,6 +194,12 @@ void unmap_domain_page(const void *ptr)
     if ( !va || va >= DIRECTMAP_VIRT_START )
         return;
 
+    if ( virt_is_fixmap(va) )
+    {
+        pmap_unmap(ptr);
+        return;
+    }
+
     ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
 
     v = mapcache_current_vcpu();
@@ -237,7 +253,7 @@ int mapcache_domain_init(struct domain *d)
     unsigned int bitmap_pages;
 
 #ifdef NDEBUG
-    if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+    if ( !mem_hotplug && arch_mfn_in_directmap(0, max_page) )
         return 0;
 #endif
 
@@ -308,7 +324,7 @@ void *map_domain_page_global(mfn_t mfn)
             local_irq_is_enabled()));
 
 #ifdef NDEBUG
-    if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
+    if ( arch_mfn_in_directmap(mfn_x(mfn, 1)) )
         return mfn_to_virt(mfn_x(mfn));
 #endif
 
@@ -335,6 +351,22 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
     if ( va >= DIRECTMAP_VIRT_START )
         return _mfn(virt_to_mfn(ptr));
 
+    /*
+     * The fixmap is stealing the top-end of the VMAP. So the check for
+     * the PMAP *must* happen first.
+     *
+     * Also, the fixmap translates a slot to an address backwards. The
+     * logic will rely on it to avoid any complexity. So check at
+     * compile time this will always hold.
+     */
+    BUILD_BUG_ON(fix_to_virt(FIX_PMAP_BEGIN) < fix_to_virt(FIX_PMAP_END));
+
+    if ( virt_in_fixmap_range(va, FIX_PMAP_BEGIN, FIX_PMAP_END) )
+    {
+        BUG_ON(system_state >= SYS_STATE_smp_boot);
+        return l1e_get_mfn(l1_fixmap[l1_table_offset(va)]);
+    }
+
     if ( va >= VMAP_VIRT_START && va < VMAP_VIRT_END )
         return vmap_to_mfn(va);
 
diff --git a/xen/arch/x86/include/asm/fixmap.h b/xen/arch/x86/include/asm/fixmap.h
index 80b7b74fd816..381c95a8b11f 100644
--- a/xen/arch/x86/include/asm/fixmap.h
+++ b/xen/arch/x86/include/asm/fixmap.h
@@ -101,6 +101,31 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
     return __virt_to_fix(vaddr);
 }
 
+static inline bool virt_is_fixmap(const unsigned long vaddr)
+{
+    return vaddr >= FIXADDR_START && vaddr < FIXADDR_TOP;
+}
+
+static inline bool virt_in_fixmap_range(
+    const unsigned long vaddr,
+    const unsigned int start_idx,
+    const unsigned int end_idx
+)
+{
+    unsigned long start_addr = (unsigned long)fix_to_virt(start_idx);
+    unsigned long end_addr = (unsigned long)fix_to_virt(end_idx);
+
+    /*
+     * The check ensures that the virtual address (vaddr) is within the
+     * fixmap range. The addresses are allocated backwards, meaning the
+     * start address is higher than the end address. As a result, the
+     * check ensures that the virtual address is greater than or equal to
+     * the end address, and less than or equal to the start address, which
+     * may appear counterintuitive due to the reverse allocation order.
+     */
+    return ((vaddr & PAGE_MASK) <= start_addr) && (vaddr >= end_addr);
+}
+
 enum fixed_addresses_x {
     /* Index 0 is reserved since fix_x_to_virt(0) == FIXADDR_X_TOP. */
     FIX_X_RESERVED,
-- 
2.40.1
Re: [PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap
Posted by Jan Beulich 4 days, 2 hours ago
On 11.11.2024 14:11, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
> 
> When mfn is not in direct map, never use mfn_to_virt for any mappings.
> 
> We replace mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) with
> arch_mfns_in_direct_map(mfn, 1) because these two are equivalent. The
> extra comparison in arch_mfns_in_direct_map() looks different but because
> DIRECTMAP_VIRT_END is always higher, it does not make any difference.
> 
> Lastly, domain_page_map_to_mfn() needs to gain to a special case for
> the PMAP.
> 
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
> 
> ----

Just to mention it (noticed while reading Alejandro's reply, and I didn't
check the rest of the series): This is lacking your S-o-b.

Jan
Re: [PATCH V4 07/15] x86/domain_page: Remove the fast paths when mfn is not in the directmap
Posted by Alejandro Vallejo 4 days, 16 hours ago
I'm still headscratching about various things, but the build errors are on
release builds without pmap enabled. I've highlighted them here.

On Mon Nov 11, 2024 at 1:11 PM GMT, Elias El Yandouzi wrote:
> From: Hongyan Xia <hongyxia@amazon.com>
>
> When mfn is not in direct map, never use mfn_to_virt for any mappings.
>
> We replace mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) with
> arch_mfns_in_direct_map(mfn, 1) because these two are equivalent. The
> extra comparison in arch_mfns_in_direct_map() looks different but because
> DIRECTMAP_VIRT_END is always higher, it does not make any difference.
>
> Lastly, domain_page_map_to_mfn() needs to gain to a special case for
> the PMAP.
>
> Signed-off-by: Hongyan Xia <hongyxia@amazon.com>
> Signed-off-by: Julien Grall <jgrall@amazon.com>
>
> ----
>
>     Changes in v4:
>         * Introduce helper functions virt_is_fixmap and virt_in_fixmap_range
>
>     Changes since Hongyan's version:
>         * arch_mfn_in_direct_map() was renamed to arch_mfns_in_directmap()
>         * add a special case for the PMAP in domain_page_map_to_mfn()
>
> diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
> index 55e337aaf703..df7d4750ef05 100644
> --- a/xen/arch/x86/domain_page.c
> +++ b/xen/arch/x86/domain_page.c
> @@ -14,8 +14,10 @@
>  #include <xen/sched.h>
>  #include <xen/vmap.h>
>  #include <asm/current.h>
> +#include <asm/fixmap.h>
>  #include <asm/flushtlb.h>
>  #include <asm/hardirq.h>
> +#include <asm/pmap.h>
>  #include <asm/setup.h>
>  
>  static DEFINE_PER_CPU(struct vcpu *, override);
> @@ -24,6 +26,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
>  {
>      /* In the common case we use the mapcache of the running VCPU. */
>      struct vcpu *v = this_cpu(override) ?: current;
> +    struct vcpu *idle_v = idle_vcpu[smp_processor_id()];
>  
>      /*
>       * When current isn't properly set up yet, this is equivalent to
> @@ -35,10 +38,11 @@ static inline struct vcpu *mapcache_current_vcpu(void)
>      /*
>       * When using efi runtime page tables, we have the equivalent of the idle
>       * domain's page tables but current may point at another domain's VCPU.
> -     * Return NULL as though current is not properly set up yet.
> +     * Return the idle domains's vcpu on that core because the efi per-domain
> +     * region (where the mapcache is) is in-sync with the idle domain.
>       */
>      if ( efi_rs_using_pgtables() )
> -        return NULL;
> +        return idle_v;
>  
>      /*
>       * If guest_table is NULL, and we are running a paravirtualised guest,
> @@ -48,7 +52,7 @@ static inline struct vcpu *mapcache_current_vcpu(void)
>      if ( unlikely(pagetable_is_null(v->arch.guest_table)) && is_pv_vcpu(v) )
>      {
>          /* If we really are idling, perform lazy context switch now. */
> -        if ( (v = idle_vcpu[smp_processor_id()]) == current )
> +        if ( (v = idle_v) == current )
>              sync_local_execstate();
>          /* We must now be running on the idle page table. */
>          ASSERT(cr3_pa(read_cr3()) == __pa(idle_pg_table));
> @@ -77,18 +81,24 @@ void *map_domain_page(mfn_t mfn)
>      struct vcpu_maphash_entry *hashent;
>  
>  #ifdef NDEBUG
> -    if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
> +    if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
>          return mfn_to_virt(mfn_x(mfn));
>  #endif
>  
>      v = mapcache_current_vcpu();
> -    if ( !v )
> -        return mfn_to_virt(mfn_x(mfn));
> +    if ( !v || !v->domain->arch.mapcache.inuse )
> +    {
> +        if ( arch_mfns_in_directmap(mfn_x(mfn), 1) )
> +            return mfn_to_virt(mfn_x(mfn));
> +        else
> +        {
> +            BUG_ON(system_state >= SYS_STATE_smp_boot);

Missing CONFIG_HAS_PMAP guards around this return. Without it this wants to
BUG(), I think. I'm not entirely convinced the current logic takes into account
the extended directmap present in HVM and idle vCPUs though.

arch_mfns_in_directmap() merely checks they fit in DIRECTMAP_SIZE, doesn't it?

> +            return pmap_map(mfn);
> +        }
> +    }
>  
>      dcache = &v->domain->arch.mapcache;
>      vcache = &v->arch.mapcache;
> -    if ( !dcache->inuse )
> -        return mfn_to_virt(mfn_x(mfn));
>  
>      perfc_incr(map_domain_page_count);
>  
> @@ -184,6 +194,12 @@ void unmap_domain_page(const void *ptr)
>      if ( !va || va >= DIRECTMAP_VIRT_START )
>          return;
>  
> +    if ( virt_is_fixmap(va) )
> +    {
> +        pmap_unmap(ptr);
> +        return;
> +    }
> +

This hunk is also missing CONFIG_HAS_PMAP guards.

>      ASSERT(va >= MAPCACHE_VIRT_START && va < MAPCACHE_VIRT_END);
>  
>      v = mapcache_current_vcpu();
> @@ -237,7 +253,7 @@ int mapcache_domain_init(struct domain *d)
>      unsigned int bitmap_pages;
>  
>  #ifdef NDEBUG
> -    if ( !mem_hotplug && max_page <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
> +    if ( !mem_hotplug && arch_mfn_in_directmap(0, max_page) )

I suspect you wanted arch_mfns_in_directmap() rather than arch_mfn_in_directmap().

>          return 0;
>  #endif
>  
> @@ -308,7 +324,7 @@ void *map_domain_page_global(mfn_t mfn)
>              local_irq_is_enabled()));
>  
>  #ifdef NDEBUG
> -    if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
> +    if ( arch_mfn_in_directmap(mfn_x(mfn, 1)) )

I suspect you wanted 's/mfn_x(mfn, 1)/mfn_x(mfn), 1/' instead?

>          return mfn_to_virt(mfn_x(mfn));
>  #endif
>  
> @@ -335,6 +351,22 @@ mfn_t domain_page_map_to_mfn(const void *ptr)
>      if ( va >= DIRECTMAP_VIRT_START )
>          return _mfn(virt_to_mfn(ptr));
>  
> +    /*
> +     * The fixmap is stealing the top-end of the VMAP. So the check for
> +     * the PMAP *must* happen first.
> +     *
> +     * Also, the fixmap translate a slot to an address backwards. The
> +     * logic will rely on it to avoid any complexity. So check at
> +     * compile time this will always hold.
> +    */
> +    BUILD_BUG_ON(fix_to_virt(FIX_PMAP_BEGIN) < fix_to_virt(FIX_PMAP_END));
> +
> +    if ( virt_in_fixmap_range(va, FIX_PMAP_BEGIN, FIX_PMAP_END) )
> +    {
> +        BUG_ON(system_state >= SYS_STATE_smp_boot);
> +        return l1e_get_mfn(l1_fixmap[l1_table_offset(va)]);
> +    }
> +

This hunk should be surrounded by CONFIG_HAS_PMAP guards or it'll fail to
compile.

>      if ( va >= VMAP_VIRT_START && va < VMAP_VIRT_END )
>          return vmap_to_mfn(va);
>  
> diff --git a/xen/arch/x86/include/asm/fixmap.h b/xen/arch/x86/include/asm/fixmap.h
> index 80b7b74fd816..381c95a8b11f 100644
> --- a/xen/arch/x86/include/asm/fixmap.h
> +++ b/xen/arch/x86/include/asm/fixmap.h
> @@ -101,6 +101,31 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
>      return __virt_to_fix(vaddr);
>  }
>  
> +static inline bool virt_is_fixmap(const unsigned long vaddr)
> +{
> +    return vaddr >= FIXADDR_START && vaddr < FIXADDR_TOP;
> +}
> +
> +static inline bool virt_in_fixmap_range(
> +    const unsigned long vaddr,
> +    const unsigned int start_idx,
> +    const unsigned int end_idx
> +)
> +{
> +    unsigned long start_addr = (unsigned long)fix_to_virt(start_idx);
> +    unsigned long end_addr = (unsigned long)fix_to_virt(end_idx);
> +
> +    /*
> +     * The check ensures that the virtual address (vaddr) is within the
> +     * fixmap range. The addresses are allocated backwards, meaning the
> +     * start address is higher than the end address. As a result, the
> +     * check ensures that the virtual address is greater than or equal to
> +     * the end address, and less than or equal to the start address, which
> +     * may appear counterintuitive due to the reverse allocation order.
> +     */
> +    return ((vaddr & PAGE_MASK) <= start_addr) && (vaddr >= end_addr);
> +}
> +
>  enum fixed_addresses_x {
>      /* Index 0 is reserved since fix_x_to_virt(0) == FIXADDR_X_TOP. */
>      FIX_X_RESERVED,

Cheers,
Alejandro