When using a unique per-vCPU root page table, the per-domain region becomes
per-vCPU, and hence the mapcache is no longer shared between all vCPUs of a
domain. Introduce per-vCPU mapcache structures, and modify map_domain_page()
to create per-vCPU mappings when possible. Note the lock is also not needed
when using per-vCPU mapcaches, as the structure is no longer shared.

This introduces some duplication in the domain and vcpu structures, as both
contain a mapcache field to support running both with and without per-vCPU
page tables.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
xen/arch/x86/domain_page.c | 90 ++++++++++++++++++++-----------
xen/arch/x86/include/asm/domain.h | 20 ++++---
2 files changed, 71 insertions(+), 39 deletions(-)
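
For orientation, the behavioural change below can be condensed into the
following sketch. This is purely an illustration distilled from the hunks in
this patch, not extra code to apply:

    /*
     * Illustration only: with per-vCPU page tables (d->arch.vcpu_pt) the
     * per-vCPU mapcache is used and no lock is needed; otherwise the
     * shared per-domain mapcache and its lock are used, as before.
     */
    struct mapcache *cache = d->arch.vcpu_pt ? &v->arch.pv.mapcache.cache
                                             : &d->arch.pv.mapcache.cache;

    if ( !d->arch.vcpu_pt )
        spin_lock(&d->arch.pv.mapcache.lock);

    /* ... allocate an index from cache->inuse and install the mapping ... */

    if ( !d->arch.vcpu_pt )
        spin_unlock(&d->arch.pv.mapcache.lock);
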
diff --git a/xen/arch/x86/domain_page.c b/xen/arch/x86/domain_page.c
index 1372be20224e..65900d6218f8 100644
--- a/xen/arch/x86/domain_page.c
+++ b/xen/arch/x86/domain_page.c
@@ -74,7 +74,9 @@ void *map_domain_page(mfn_t mfn)
struct vcpu *v;
struct mapcache_domain *dcache;
struct mapcache_vcpu *vcache;
+ struct mapcache *cache;
struct vcpu_maphash_entry *hashent;
+ struct domain *d;
#ifdef NDEBUG
if ( mfn_x(mfn) <= PFN_DOWN(__pa(HYPERVISOR_VIRT_END - 1)) )
@@ -85,9 +87,12 @@ void *map_domain_page(mfn_t mfn)
if ( !v || !is_pv_vcpu(v) )
return mfn_to_virt(mfn_x(mfn));
- dcache = &v->domain->arch.pv.mapcache;
+ d = v->domain;
+ dcache = &d->arch.pv.mapcache;
vcache = &v->arch.pv.mapcache;
- if ( !dcache->inuse )
+ cache = d->arch.vcpu_pt ? &v->arch.pv.mapcache.cache
+ : &d->arch.pv.mapcache.cache;
+ if ( !cache->inuse )
return mfn_to_virt(mfn_x(mfn));
perfc_incr(map_domain_page_count);
@@ -98,17 +103,18 @@ void *map_domain_page(mfn_t mfn)
if ( hashent->mfn == mfn_x(mfn) )
{
idx = hashent->idx;
- ASSERT(idx < dcache->entries);
+ ASSERT(idx < cache->entries);
hashent->refcnt++;
ASSERT(hashent->refcnt);
ASSERT(mfn_eq(l1e_get_mfn(MAPCACHE_L1ENT(idx)), mfn));
goto out;
}
- spin_lock(&dcache->lock);
+ if ( !d->arch.vcpu_pt )
+ spin_lock(&dcache->lock);
/* Has some other CPU caused a wrap? We must flush if so. */
- if ( unlikely(dcache->epoch != vcache->shadow_epoch) )
+ if ( unlikely(!d->arch.vcpu_pt && dcache->epoch != vcache->shadow_epoch) )
{
vcache->shadow_epoch = dcache->epoch;
if ( NEED_FLUSH(this_cpu(tlbflush_time), dcache->tlbflush_timestamp) )
@@ -118,21 +124,21 @@ void *map_domain_page(mfn_t mfn)
}
}
- idx = find_next_zero_bit(dcache->inuse, dcache->entries, dcache->cursor);
- if ( unlikely(idx >= dcache->entries) )
+ idx = find_next_zero_bit(cache->inuse, cache->entries, cache->cursor);
+ if ( unlikely(idx >= cache->entries) )
{
unsigned long accum = 0, prev = 0;
/* /First/, clean the garbage map and update the inuse list. */
- for ( i = 0; i < BITS_TO_LONGS(dcache->entries); i++ )
+ for ( i = 0; i < BITS_TO_LONGS(cache->entries); i++ )
{
accum |= prev;
- dcache->inuse[i] &= ~xchg(&dcache->garbage[i], 0);
- prev = ~dcache->inuse[i];
+ cache->inuse[i] &= ~xchg(&cache->garbage[i], 0);
+ prev = ~cache->inuse[i];
}
- if ( accum | (prev & BITMAP_LAST_WORD_MASK(dcache->entries)) )
- idx = find_first_zero_bit(dcache->inuse, dcache->entries);
+ if ( accum | (prev & BITMAP_LAST_WORD_MASK(cache->entries)) )
+ idx = find_first_zero_bit(cache->inuse, cache->entries);
else
{
/* Replace a hash entry instead. */
@@ -152,19 +158,23 @@ void *map_domain_page(mfn_t mfn)
i = 0;
} while ( i != MAPHASH_HASHFN(mfn_x(mfn)) );
}
- BUG_ON(idx >= dcache->entries);
+ BUG_ON(idx >= cache->entries);
/* /Second/, flush TLBs. */
perfc_incr(domain_page_tlb_flush);
flush_tlb_local();
- vcache->shadow_epoch = ++dcache->epoch;
- dcache->tlbflush_timestamp = tlbflush_current_time();
+ if ( !d->arch.vcpu_pt )
+ {
+ vcache->shadow_epoch = ++dcache->epoch;
+ dcache->tlbflush_timestamp = tlbflush_current_time();
+ }
}
- set_bit(idx, dcache->inuse);
- dcache->cursor = idx + 1;
+ set_bit(idx, cache->inuse);
+ cache->cursor = idx + 1;
- spin_unlock(&dcache->lock);
+ if ( !d->arch.vcpu_pt )
+ spin_unlock(&dcache->lock);
l1e_write(&MAPCACHE_L1ENT(idx), l1e_from_mfn(mfn, __PAGE_HYPERVISOR_RW));
@@ -178,6 +188,7 @@ void unmap_domain_page(const void *ptr)
unsigned int idx;
struct vcpu *v;
struct mapcache_domain *dcache;
+ struct mapcache *cache;
unsigned long va = (unsigned long)ptr, mfn, flags;
struct vcpu_maphash_entry *hashent;
@@ -190,7 +201,9 @@ void unmap_domain_page(const void *ptr)
ASSERT(v && is_pv_vcpu(v));
dcache = &v->domain->arch.pv.mapcache;
- ASSERT(dcache->inuse);
+ cache = v->domain->arch.vcpu_pt ? &v->arch.pv.mapcache.cache
+ : &v->domain->arch.pv.mapcache.cache;
+ ASSERT(cache->inuse);
idx = PFN_DOWN(va - MAPCACHE_VIRT_START);
mfn = l1e_get_pfn(MAPCACHE_L1ENT(idx));
@@ -213,7 +226,7 @@ void unmap_domain_page(const void *ptr)
hashent->mfn);
l1e_write(&MAPCACHE_L1ENT(hashent->idx), l1e_empty());
/* /Second/, mark as garbage. */
- set_bit(hashent->idx, dcache->garbage);
+ set_bit(hashent->idx, cache->garbage);
}
/* Add newly-freed mapping to the maphash. */
@@ -225,7 +238,7 @@ void unmap_domain_page(const void *ptr)
/* /First/, zap the PTE. */
l1e_write(&MAPCACHE_L1ENT(idx), l1e_empty());
/* /Second/, mark as garbage. */
- set_bit(idx, dcache->garbage);
+ set_bit(idx, cache->garbage);
}
local_irq_restore(flags);
@@ -234,7 +247,6 @@ void unmap_domain_page(const void *ptr)
void mapcache_domain_init(struct domain *d)
{
struct mapcache_domain *dcache = &d->arch.pv.mapcache;
- unsigned int bitmap_pages;
ASSERT(is_pv_domain(d));
@@ -243,13 +255,12 @@ void mapcache_domain_init(struct domain *d)
return;
#endif
+ if ( d->arch.vcpu_pt )
+ return;
+
BUILD_BUG_ON(MAPCACHE_VIRT_END + PAGE_SIZE * (3 +
2 * PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long))) >
MAPCACHE_VIRT_START + (PERDOMAIN_SLOT_MBYTES << 20));
- bitmap_pages = PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
- dcache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
- dcache->garbage = dcache->inuse +
- (bitmap_pages + 1) * PAGE_SIZE / sizeof(long);
spin_lock_init(&dcache->lock);
}
@@ -258,30 +269,45 @@ int mapcache_vcpu_init(struct vcpu *v)
{
struct domain *d = v->domain;
struct mapcache_domain *dcache = &d->arch.pv.mapcache;
+ struct mapcache *cache;
unsigned long i;
- unsigned int ents = d->max_vcpus * MAPCACHE_VCPU_ENTRIES;
+ unsigned int ents = (d->arch.vcpu_pt ? 1 : d->max_vcpus) *
+ MAPCACHE_VCPU_ENTRIES;
unsigned int nr = PFN_UP(BITS_TO_LONGS(ents) * sizeof(long));
- if ( !is_pv_vcpu(v) || !dcache->inuse )
+ if ( !is_pv_vcpu(v) )
return 0;
- if ( ents > dcache->entries )
+ cache = d->arch.vcpu_pt ? &v->arch.pv.mapcache.cache
+ : &dcache->cache;
+
+ if ( !cache->inuse )
+ return 0;
+
+ if ( ents > cache->entries )
{
/* Populate page tables. */
int rc = create_perdomain_mapping(v, MAPCACHE_VIRT_START, ents, false);
+ const unsigned int bitmap_pages =
+ PFN_UP(BITS_TO_LONGS(MAPCACHE_ENTRIES) * sizeof(long));
+
+ cache->inuse = (void *)MAPCACHE_VIRT_END + PAGE_SIZE;
+ cache->garbage = cache->inuse +
+ (bitmap_pages + 1) * PAGE_SIZE / sizeof(long);
+
/* Populate bit maps. */
if ( !rc )
- rc = create_perdomain_mapping(v, (unsigned long)dcache->inuse,
+ rc = create_perdomain_mapping(v, (unsigned long)cache->inuse,
nr, true);
if ( !rc )
- rc = create_perdomain_mapping(v, (unsigned long)dcache->garbage,
+ rc = create_perdomain_mapping(v, (unsigned long)cache->garbage,
nr, true);
if ( rc )
return rc;
- dcache->entries = ents;
+ cache->entries = ents;
}
/* Mark all maphash entries as not in use. */
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index 5bf0ad3fdcf7..ba5440099d90 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -41,6 +41,16 @@ struct trap_bounce {
unsigned long eip;
};
+struct mapcache {
+ /* The number of array entries, and a cursor into the array. */
+ unsigned int entries;
+ unsigned int cursor;
+
+ /* Which mappings are in use, and which are garbage to reap next epoch? */
+ unsigned long *inuse;
+ unsigned long *garbage;
+};
+
#define MAPHASH_ENTRIES 8
#define MAPHASH_HASHFN(pfn) ((pfn) & (MAPHASH_ENTRIES-1))
#define MAPHASHENT_NOTINUSE ((u32)~0U)
@@ -54,13 +64,11 @@ struct mapcache_vcpu {
uint32_t idx;
uint32_t refcnt;
} hash[MAPHASH_ENTRIES];
+
+ struct mapcache cache;
};
struct mapcache_domain {
- /* The number of array entries, and a cursor into the array. */
- unsigned int entries;
- unsigned int cursor;
-
/* Protects map_domain_page(). */
spinlock_t lock;
@@ -68,9 +76,7 @@ struct mapcache_domain {
unsigned int epoch;
u32 tlbflush_timestamp;
- /* Which mappings are in use, and which are garbage to reap next epoch? */
- unsigned long *inuse;
- unsigned long *garbage;
+ struct mapcache cache;
};
void mapcache_domain_init(struct domain *d);
--
2.46.0
On Wed Jan 8, 2025 at 2:26 PM GMT, Roger Pau Monne wrote:
> [...]
> @@ -98,17 +103,18 @@ void *map_domain_page(mfn_t mfn)
> [...]
> -    spin_lock(&dcache->lock);
> +    if ( !d->arch.vcpu_pt )
> +        spin_lock(&dcache->lock);

Hmmm. I wonder whether we might not want a nospec here...

> [...]
> -    spin_unlock(&dcache->lock);
> +    if ( !d->arch.vcpu_pt )
> +        spin_unlock(&dcache->lock);

... and here.

> [...]

Cheers,
Alejandro
On Thu, Jan 09, 2025 at 03:08:15PM +0000, Alejandro Vallejo wrote:
> On Wed Jan 8, 2025 at 2:26 PM GMT, Roger Pau Monne wrote:
> > [...]
> > -    spin_lock(&dcache->lock);
> > +    if ( !d->arch.vcpu_pt )
> > +        spin_lock(&dcache->lock);
>
> Hmmm. I wonder whether we might not want a nospec here...

Not sure TBH, we have other instances of conditional locking that
doesn't use nospec(). That said I'm not claiming those are correct.
Shouldn't people that care about this kind of speculation into
critical regions just use CONFIG_SPECULATIVE_HARDEN_LOCK?

Thanks, Roger.
On Fri Jan 10, 2025 at 3:02 PM GMT, Roger Pau Monné wrote:
> On Thu, Jan 09, 2025 at 03:08:15PM +0000, Alejandro Vallejo wrote:
> > On Wed Jan 8, 2025 at 2:26 PM GMT, Roger Pau Monne wrote:
> > > [...]
> > > -    spin_lock(&dcache->lock);
> > > +    if ( !d->arch.vcpu_pt )
> > > +        spin_lock(&dcache->lock);
> >
> > Hmmm. I wonder whether we might not want a nospec here...
>
> Not sure TBH, we have other instances of conditional locking that
> doesn't use nospec(). That said I'm not claiming those are correct.
> Shouldn't people that care about this kind of speculation into
> critical regions just use CONFIG_SPECULATIVE_HARDEN_LOCK?
>
> Thanks, Roger.

Actually, to avoid the double lfence, I think this would work too while
avoiding the lfence unconditionally when CONFIG_SPECULATIVE_HARDEN_LOCK
is not set:

    if ( !d->arch.vcpu_pt )
        spin_lock(&dcache->lock);
    else
        block_lock_speculation();

Cheers,
Alejandro
On Fri, Jan 10, 2025 at 04:19:03PM +0000, Alejandro Vallejo wrote:
> On Fri Jan 10, 2025 at 3:02 PM GMT, Roger Pau Monné wrote:
> > On Thu, Jan 09, 2025 at 03:08:15PM +0000, Alejandro Vallejo wrote:
> > > [...]
> > > Hmmm. I wonder whether we might not want a nospec here...
> >
> > Not sure TBH, we have other instances of conditional locking that
> > doesn't use nospec(). That said I'm not claiming those are correct.
> > Shouldn't people that care about this kind of speculation into
> > critical regions just use CONFIG_SPECULATIVE_HARDEN_LOCK?
>
> Actually, to avoid the double lfence, I think this would work too while
> avoiding the lfence unconditionally when CONFIG_SPECULATIVE_HARDEN_LOCK
> is not set:
>
>     if ( !d->arch.vcpu_pt )
>         spin_lock(&dcache->lock);
>     else
>         block_lock_speculation();

We have a spin_lock_if() helper to do that. I will use it here.

Thanks, Roger.
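
As an illustration of the spin_lock_if() route mentioned above, the
conditional-lock hunk could end up looking roughly like the sketch below.
This assumes the helper takes (condition, lock) and falls back to
block_lock_speculation() when the condition is false; the final shape is of
course up to the author.

    /*
     * Sketch only (assumes spin_lock_if(cond, lock) semantics): take the
     * per-domain mapcache lock unless per-vCPU page tables are in use,
     * while still blocking lock speculation on the unlocked path.
     */
    spin_lock_if(!d->arch.vcpu_pt, &dcache->lock);

    /* ... allocate the mapcache index ... */

    if ( !d->arch.vcpu_pt )
        spin_unlock(&dcache->lock);
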
On Fri Jan 10, 2025 at 3:02 PM GMT, Roger Pau Monné wrote:
> On Thu, Jan 09, 2025 at 03:08:15PM +0000, Alejandro Vallejo wrote:
> > On Wed Jan 8, 2025 at 2:26 PM GMT, Roger Pau Monne wrote:
> > > [...]
> > > -    spin_lock(&dcache->lock);
> > > +    if ( !d->arch.vcpu_pt )
> > > +        spin_lock(&dcache->lock);
> >
> > Hmmm. I wonder whether we might not want a nospec here...
>
> Not sure TBH, we have other instances of conditional locking that
> doesn't use nospec(). That said I'm not claiming those are correct.
> Shouldn't people that care about this kind of speculation into
> critical regions just use CONFIG_SPECULATIVE_HARDEN_LOCK?

Do people that care have a choice though? CONFIG_SPECULATIVE_HARDEN_LOCK
only blocks speculation in the taken branch here, so the critical region
isn't hardened when the relaxed branch is followed.

I suspect nospec in the condition would be fine perf-wise because the CPU
can still do straight-line speculation on the underlying function call
when CONFIG_SPECULATIVE_HARDEN_LOCK is not defined.

It's not the end of the world either way.

> Thanks, Roger.

Cheers,
Alejandro
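
For comparison, the "nospec in the condition" alternative being discussed
could be sketched as below. This assumes Xen's evaluate_nospec() (gated on
CONFIG_SPECULATIVE_HARDEN_BRANCH) is acceptable here; it is not what the
patch currently does.

    /*
     * Sketch only: hardening the branch itself prevents speculatively
     * entering either path with a mispredicted d->arch.vcpu_pt, so the
     * unlocked (per-vCPU) path is covered as well.
     */
    if ( evaluate_nospec(!d->arch.vcpu_pt) )
        spin_lock(&dcache->lock);
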