[PATCH 16/22] x86/mm: introduce a per-CPU L3 table for the per-domain slot

Roger Pau Monne posted 22 patches 1 month, 3 weeks ago
[PATCH 16/22] x86/mm: introduce a per-CPU L3 table for the per-domain slot
Posted by Roger Pau Monne 1 month, 3 weeks ago
So far L4 slot 260 has always been per-domain, in other words: all vCPUs of a
domain share the same L3 entry.  Currently only 3 slots are used in that L3
table, which leaves plenty of room.

Introduce a per-CPU L3 that's used when the domain has Address Space Isolation
enabled.  Such per-CPU L3 gets currently populated using the same L3 entries
present on the per-domain L3 (d->arch.perdomain_l3_pg).

No functional change expected, as the per-CPU L3 is always a copy of the
contents of d->arch.perdomain_l3_pg.

Note that all the per-domain L3 entries are populated at domain create, and
hence there's no need to sync the state of the per-CPU L3 as the domain won't
yet be running when the L3 is modified.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/include/asm/domain.h |  2 +
 xen/arch/x86/include/asm/mm.h     |  4 ++
 xen/arch/x86/mm.c                 | 80 +++++++++++++++++++++++++++++--
 xen/arch/x86/setup.c              |  8 ++++
 xen/arch/x86/smpboot.c            |  4 ++
 5 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index 8c366be8c75f..7620a352b9e3 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -313,6 +313,8 @@ struct arch_domain
 {
     struct page_info *perdomain_l3_pg;
 
+    struct page_info *perdomain_l2_pgs[PERDOMAIN_SLOTS];
+
 #ifdef CONFIG_PV32
     unsigned int hv_compat_vstart;
 #endif
diff --git a/xen/arch/x86/include/asm/mm.h b/xen/arch/x86/include/asm/mm.h
index 2c309f7b1444..34407fb0af06 100644
--- a/xen/arch/x86/include/asm/mm.h
+++ b/xen/arch/x86/include/asm/mm.h
@@ -633,4 +633,8 @@ static inline bool arch_mfns_in_directmap(unsigned long mfn, unsigned long nr)
 /* Setup the per-domain slot in the root page table pointer. */
 void setup_perdomain_slot(const struct vcpu *v, root_pgentry_t *root_pgt);
 
+/* Allocate a per-CPU local L3 table to use in the per-domain slot. */
+int allocate_perdomain_local_l3(unsigned int cpu);
+void free_perdomain_local_l3(unsigned int cpu);
+
 #endif /* __ASM_X86_MM_H__ */
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 13aa15f4db22..1367f3361ffe 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -6079,6 +6079,12 @@ int create_perdomain_mapping(struct domain *d, unsigned long va,
         l2tab = __map_domain_page(pg);
         clear_page(l2tab);
         l3tab[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR_RW);
+        /*
+         * Keep a reference to the per-domain L3 entries in case a per-CPU L3
+         * is in use (as opposed to using perdomain_l3_pg).
+         */
+        ASSERT(!d->creation_finished);
+        d->arch.perdomain_l2_pgs[l3_table_offset(va)] = pg;
     }
     else
         l2tab = map_l2t_from_l3e(l3tab[l3_table_offset(va)]);
@@ -6368,11 +6374,79 @@ unsigned long get_upper_mfn_bound(void)
     return min(max_mfn, 1UL << (paddr_bits - PAGE_SHIFT)) - 1;
 }
 
+static DEFINE_PER_CPU(l3_pgentry_t *, local_l3);
+
+static void populate_perdomain(const struct domain *d, l4_pgentry_t *l4,
+                               l3_pgentry_t *l3)
+{
+    unsigned int i;
+
+    /* Populate the per-CPU L3 with the per-domain entries. */
+    for ( i = 0; i < ARRAY_SIZE(d->arch.perdomain_l2_pgs); i++ )
+    {
+        const struct page_info *pg = d->arch.perdomain_l2_pgs[i];
+
+        BUILD_BUG_ON(ARRAY_SIZE(d->arch.perdomain_l2_pgs) >
+                     L3_PAGETABLE_ENTRIES);
+        l3e_write(&l3[i], pg ? l3e_from_page(pg, __PAGE_HYPERVISOR_RW)
+                             : l3e_empty());
+    }
+
+    l4e_write(&l4[l4_table_offset(PERDOMAIN_VIRT_START)],
+              l4e_from_mfn(virt_to_mfn(l3), __PAGE_HYPERVISOR_RW));
+}
+
+int allocate_perdomain_local_l3(unsigned int cpu)
+{
+    const struct domain *d = idle_vcpu[cpu]->domain;
+    l3_pgentry_t *l3;
+    root_pgentry_t *root_pgt = maddr_to_virt(idle_vcpu[cpu]->arch.cr3);
+
+    ASSERT(!per_cpu(local_l3, cpu));
+
+    if ( !opt_asi_pv && !opt_asi_hvm )
+        return 0;
+
+    l3 = alloc_xenheap_page();
+    if ( !l3 )
+        return -ENOMEM;
+
+    clear_page(l3);
+
+    /* Setup the idle domain slots (current domain) in the L3. */
+    populate_perdomain(d, root_pgt, l3);
+
+    per_cpu(local_l3, cpu) = l3;
+
+    return 0;
+}
+
+void free_perdomain_local_l3(unsigned int cpu)
+{
+    l3_pgentry_t *l3 = per_cpu(local_l3, cpu);
+
+    if ( !l3 )
+        return;
+
+    per_cpu(local_l3, cpu) = NULL;
+    free_xenheap_page(l3);
+}
+
 void setup_perdomain_slot(const struct vcpu *v, root_pgentry_t *root_pgt)
 {
-    l4e_write(&root_pgt[root_table_offset(PERDOMAIN_VIRT_START)],
-              l4e_from_page(v->domain->arch.perdomain_l3_pg,
-                            __PAGE_HYPERVISOR_RW));
+    const struct domain *d = v->domain;
+
+    if ( d->arch.asi )
+    {
+        l3_pgentry_t *l3 = this_cpu(local_l3);
+
+        ASSERT(l3);
+        populate_perdomain(d, root_pgt, l3);
+    }
+    else if ( is_hvm_domain(d) || d->arch.pv.xpti )
+        l4e_write(&root_pgt[root_table_offset(PERDOMAIN_VIRT_START)],
+                  l4e_from_page(v->domain->arch.perdomain_l3_pg,
+                                __PAGE_HYPERVISOR_RW));
 
     if ( !is_pv_64bit_vcpu(v) )
         /*
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index c5a13b30daf4..5bf81b81b46f 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -1961,6 +1961,14 @@ void asmlinkage __init noreturn __start_xen(unsigned long mbi_p)
 
     alternative_branches();
 
+    /*
+     * Setup the local per-domain L3 for the BSP also, so it matches the state
+     * of the APs.
+     */
+    ret = allocate_perdomain_local_l3(0);
+    if ( ret )
+        panic("Error %d setting up local per-domain L3\n", ret);
+
     /*
      * NB: when running as a PV shim VCPUOP_up/down is wired to the shim
      * physical cpu_add/remove functions, so launch the guest with only
diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c
index e07add36b1b6..40cc14799252 100644
--- a/xen/arch/x86/smpboot.c
+++ b/xen/arch/x86/smpboot.c
@@ -986,6 +986,7 @@ static void cpu_smpboot_free(unsigned int cpu, bool remove)
     }
 
     cleanup_cpu_root_pgt(cpu);
+    free_perdomain_local_l3(cpu);
 
     if ( per_cpu(stubs.addr, cpu) )
     {
@@ -1100,6 +1101,9 @@ static int cpu_smpboot_alloc(unsigned int cpu)
     per_cpu(stubs.addr, cpu) = stub_page + STUB_BUF_CPU_OFFS(cpu);
 
     rc = setup_cpu_root_pgt(cpu);
+    if ( rc )
+        goto out;
+    rc = allocate_perdomain_local_l3(cpu);
     if ( rc )
         goto out;
     rc = -ENOMEM;
-- 
2.45.2


Re: [PATCH 16/22] x86/mm: introduce a per-CPU L3 table for the per-domain slot
Posted by Alejandro Vallejo 1 month ago
On Fri Jul 26, 2024 at 4:22 PM BST, Roger Pau Monne wrote:
> So far L4 slot 260 has always been per-domain, in other words: all vCPUs of a
> domain share the same L3 entry.  Currently only 3 slots are used in that L3
> table, which leaves plenty of room.
>
> Introduce a per-CPU L3 that's used the the domain has Address Space Isolation
> enabled.  Such per-CPU L3 gets currently populated using the same L3 entries
> present on the per-domain L3 (d->arch.perdomain_l3_pg).
>
> No functional change expected, as the per-CPU L3 is always a copy of the
> contents of d->arch.perdomain_l3_pg.
>
> Note that all the per-domain L3 entries are populated at domain create, and
> hence there's no need to sync the state of the per-CPU L3 as the domain won't
> yet be running when the L3 is modified.
>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>

Still scratching my head with the details on this, but in general I'm utterly
confused whenever I read per-CPU in the series because it's not obvious which
CPU (p or v) I should be thinking about. A general change that would help a lot
is to replace every instance of per-CPU with per-vCPU or per-pCPU as needed.

Cheers,
Alejandro