[PATCH v3 0/2] VMX: apic access page handling adjustments
Posted by Jan Beulich 3 years, 2 months ago
The latter of the two changes was on my mental todo list for a
very long time. With Julien reporting a problem with the handling
of this page, I finally felt urged to make a patch. As it turns
out, for addressing this problem only the first of the now split
patches is needed, and the second can be further discussed and
considered for 4.16.

1: delay p2m insertion of APIC access page
2: use a single, global APIC access page

Jan

[PATCH v4 0/3] VMX APIC access page and shadow adjustments
Posted by Jan Beulich 3 years ago
1: VMX: use a single, global APIC access page
2: x86/shadow: re-use variables in shadow_get_page_from_l1e()
3: x86/shadow: streamline shadow_get_page_from_l1e()

Jan

Really v5 (was: [PATCH v4 0/3] VMX APIC access page and shadow adjustments)
Posted by Jan Beulich 3 years ago
On 23.04.2021 12:51, Jan Beulich wrote:
> 1: VMX: use a single, global APIC access page
> 2: x86/shadow: re-use variables in shadow_get_page_from_l1e()
> 3: x86/shadow: streamline shadow_get_page_from_l1e()

I'm sorry, I'm noticing only now that I've typoed the version.

Jan

[PATCH v4 1/3] VMX: use a single, global APIC access page
Posted by Jan Beulich 3 years ago
The address of this page is used by the CPU only to recognize when to
access the virtual APIC page instead. No accesses would ever go to this
page. It only needs to be present in the (CPU) page tables so that
address translation will produce its address as the result for the
respective accesses.

By making this page global, we also eliminate the need to refcount it,
or to assign it to any domain in the first place.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: Init apic_access_mfn to INVALID_MFN. Move assignment out of if()
    condition. Introduce page_suppress_refcounting() and
    page_refcounting_suppressed().
v4: Set PGC_extra on the page. Make shadow mode work.
v3: Split p2m insertion change to a separate patch.
v2: Avoid insertion when !has_vlapic(). Split off change to
    p2m_get_iommu_flags().
---
I did further consider not allocating any real page at all, but just
using the address of some unpopulated space (which would require
announcing this page as reserved to Dom0, so it wouldn't put any PCI
MMIO BARs there). But I thought this would be too controversial, because
of the possible risks associated with such an approach.
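
A hedged sketch of the VMCS wiring this relies on (the helper name
install_apic_pages is made up for illustration; the field names and
accessors are the ones the hunks below use or that vmcs.h provides):

/*
 * The CPU only ever compares guest-physical addresses against the page
 * named by APIC_ACCESS_ADDR; reads and writes are satisfied from the
 * per-vCPU page named by VIRTUAL_APIC_PAGE_ADDR. The access page's
 * contents therefore never matter, and a single global page suffices.
 */
static void install_apic_pages(struct vcpu *v, mfn_t access_mfn)
{
    vmx_vmcs_enter(v);
    __vmwrite(APIC_ACCESS_ADDR, mfn_to_maddr(access_mfn));
    __vmwrite(VIRTUAL_APIC_PAGE_ADDR,
              page_to_maddr(vcpu_vlapic(v)->regs_page));
    vmx_vmcs_exit(v);
}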

--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -66,8 +66,7 @@ boolean_param("force-ept", opt_force_ept
 static void vmx_ctxt_switch_from(struct vcpu *v);
 static void vmx_ctxt_switch_to(struct vcpu *v);
 
-static int  vmx_alloc_vlapic_mapping(struct domain *d);
-static void vmx_free_vlapic_mapping(struct domain *d);
+static int alloc_vlapic_mapping(void);
 static void vmx_install_vlapic_mapping(struct vcpu *v);
 static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr,
                                 unsigned int flags);
@@ -78,6 +77,8 @@ static int vmx_msr_read_intercept(unsign
 static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content);
 static void vmx_invlpg(struct vcpu *v, unsigned long linear);
 
+static mfn_t __read_mostly apic_access_mfn = INVALID_MFN_INITIALIZER;
+
 /* Values for domain's ->arch.hvm_domain.pi_ops.flags. */
 #define PI_CSW_FROM (1u << 0)
 #define PI_CSW_TO   (1u << 1)
@@ -401,7 +402,6 @@ static int vmx_domain_initialise(struct
         .to   = vmx_ctxt_switch_to,
         .tail = vmx_do_resume,
     };
-    int rc;
 
     d->arch.ctxt_switch = &csw;
 
@@ -411,28 +411,22 @@ static int vmx_domain_initialise(struct
      */
     d->arch.hvm.vmx.exec_sp = is_hardware_domain(d) || opt_ept_exec_sp;
 
-    if ( !has_vlapic(d) )
-        return 0;
-
-    if ( (rc = vmx_alloc_vlapic_mapping(d)) != 0 )
-        return rc;
-
     return 0;
 }
 
-static void vmx_domain_relinquish_resources(struct domain *d)
+static void domain_creation_finished(struct domain *d)
 {
-    if ( !has_vlapic(d) )
+    gfn_t gfn = gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE);
+    uint8_t ipat;
+
+    if ( !has_vlapic(d) || mfn_eq(apic_access_mfn, INVALID_MFN) )
         return;
 
-    vmx_free_vlapic_mapping(d);
-}
+    ASSERT(epte_get_entry_emt(d, gfn_x(gfn), apic_access_mfn, 0, &ipat,
+                              true) == MTRR_TYPE_WRBACK);
+    ASSERT(ipat);
 
-static void domain_creation_finished(struct domain *d)
-{
-    if ( has_vlapic(d) && !mfn_eq(d->arch.hvm.vmx.apic_access_mfn, _mfn(0)) &&
-         set_mmio_p2m_entry(d, gaddr_to_gfn(APIC_DEFAULT_PHYS_BASE),
-                            d->arch.hvm.vmx.apic_access_mfn, PAGE_ORDER_4K) )
+    if ( set_mmio_p2m_entry(d, gfn, apic_access_mfn, PAGE_ORDER_4K) )
         domain_crash(d);
 }
 
@@ -2415,7 +2409,6 @@ static struct hvm_function_table __initd
     .cpu_up_prepare       = vmx_cpu_up_prepare,
     .cpu_dead             = vmx_cpu_dead,
     .domain_initialise    = vmx_domain_initialise,
-    .domain_relinquish_resources = vmx_domain_relinquish_resources,
     .domain_creation_finished = domain_creation_finished,
     .vcpu_initialise      = vmx_vcpu_initialise,
     .vcpu_destroy         = vmx_vcpu_destroy,
@@ -2662,7 +2655,7 @@ const struct hvm_function_table * __init
 {
     set_in_cr4(X86_CR4_VMXE);
 
-    if ( vmx_vmcs_init() )
+    if ( vmx_vmcs_init() || alloc_vlapic_mapping() )
     {
         printk("VMX: failed to initialise.\n");
         return NULL;
@@ -3224,7 +3217,7 @@ gp_fault:
     return X86EMUL_EXCEPTION;
 }
 
-static int vmx_alloc_vlapic_mapping(struct domain *d)
+static int __init alloc_vlapic_mapping(void)
 {
     struct page_info *pg;
     mfn_t mfn;
@@ -3232,52 +3225,34 @@ static int vmx_alloc_vlapic_mapping(stru
     if ( !cpu_has_vmx_virtualize_apic_accesses )
         return 0;
 
-    pg = alloc_domheap_page(d, MEMF_no_refcount);
+    pg = alloc_domheap_page(NULL, 0);
     if ( !pg )
         return -ENOMEM;
 
-    if ( !get_page_and_type(pg, d, PGT_writable_page) )
-    {
-        /*
-         * The domain can't possibly know about this page yet, so failure
-         * here is a clear indication of something fishy going on.
-         */
-        domain_crash(d);
-        return -ENODATA;
-    }
+    /*
+     * Signal to shadow code that this page cannot be refcounted. This also
+     * makes epte_get_entry_emt() recognize this page as "special".
+     */
+    page_suppress_refcounting(pg);
 
     mfn = page_to_mfn(pg);
     clear_domain_page(mfn);
-    d->arch.hvm.vmx.apic_access_mfn = mfn;
+    apic_access_mfn = mfn;
 
     return 0;
 }
 
-static void vmx_free_vlapic_mapping(struct domain *d)
-{
-    mfn_t mfn = d->arch.hvm.vmx.apic_access_mfn;
-
-    d->arch.hvm.vmx.apic_access_mfn = _mfn(0);
-    if ( !mfn_eq(mfn, _mfn(0)) )
-    {
-        struct page_info *pg = mfn_to_page(mfn);
-
-        put_page_alloc_ref(pg);
-        put_page_and_type(pg);
-    }
-}
-
 static void vmx_install_vlapic_mapping(struct vcpu *v)
 {
     paddr_t virt_page_ma, apic_page_ma;
 
-    if ( mfn_eq(v->domain->arch.hvm.vmx.apic_access_mfn, _mfn(0)) )
+    if ( !has_vlapic(v->domain) || mfn_eq(apic_access_mfn, INVALID_MFN) )
         return;
 
     ASSERT(cpu_has_vmx_virtualize_apic_accesses);
 
     virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
-    apic_page_ma = mfn_to_maddr(v->domain->arch.hvm.vmx.apic_access_mfn);
+    apic_page_ma = mfn_to_maddr(apic_access_mfn);
 
     vmx_vmcs_enter(v);
     __vmwrite(VIRTUAL_APIC_PAGE_ADDR, virt_page_ma);
--- a/xen/arch/x86/mm/shadow/set.c
+++ b/xen/arch/x86/mm/shadow/set.c
@@ -94,6 +94,15 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
     ASSERT(!sh_l1e_is_magic(sl1e));
     ASSERT(shadow_mode_refcounts(d));
 
+    /*
+     * Check whether refcounting is suppressed on this page. For example,
+     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
+     * get accessed, and hence there's no need to refcount it.
+     */
+    mfn = shadow_l1e_get_mfn(sl1e);
+    if ( mfn_valid(mfn) && page_refcounting_suppressed(mfn_to_page(mfn)) )
+        return 0;
+
     res = get_page_from_l1e(sl1e, d, d);
 
     /*
--- a/xen/arch/x86/mm/shadow/types.h
+++ b/xen/arch/x86/mm/shadow/types.h
@@ -276,9 +276,16 @@ int shadow_set_l4e(struct domain *d, sha
 static void inline
 shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
 {
+    mfn_t mfn;
+
     if ( !shadow_mode_refcounts(d) )
         return;
 
+    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
+         /* See the respective comment in shadow_get_page_from_l1e(). */
+         page_refcounting_suppressed(mfn_to_page(mfn)) )
+        return;
+
     put_page_from_l1e(sl1e, d);
 }
 
--- a/xen/include/asm-x86/hvm/vmx/vmcs.h
+++ b/xen/include/asm-x86/hvm/vmx/vmcs.h
@@ -58,7 +58,6 @@ struct ept_data {
 #define _VMX_DOMAIN_PML_ENABLED    0
 #define VMX_DOMAIN_PML_ENABLED     (1ul << _VMX_DOMAIN_PML_ENABLED)
 struct vmx_domain {
-    mfn_t apic_access_mfn;
     /* VMX_DOMAIN_* */
     unsigned int status;
 
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -82,7 +82,7 @@
 #define PGC_state_offlined PG_mask(2, 9)
 #define PGC_state_free    PG_mask(3, 9)
 #define page_state_is(pg, st) (((pg)->count_info&PGC_state) == PGC_state_##st)
-/* Page is not reference counted */
+/* Page is not reference counted (see below for caveats) */
 #define _PGC_extra        PG_shift(10)
 #define PGC_extra         PG_mask(1, 10)
 
@@ -374,6 +374,24 @@ void zap_ro_mpt(mfn_t mfn);
 
 bool is_iomem_page(mfn_t mfn);
 
+/*
+ * Pages with no owner which may get passed to functions wanting to
+ * refcount them can be marked PGC_extra to bypass this refcounting (which
+ * would fail due to the lack of an owner).
+ *
+ * (For pages with owner PGC_extra has different meaning.)
+ */
+static inline void page_suppress_refcounting(struct page_info *pg)
+{
+   ASSERT(!page_get_owner(pg));
+   pg->count_info |= PGC_extra;
+}
+
+static inline bool page_refcounting_suppressed(const struct page_info *pg)
+{
+    return !page_get_owner(pg) && (pg->count_info & PGC_extra);
+}
+
 struct platform_bad_page {
     unsigned long mfn;
     unsigned int order;
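
The contract of the two new helpers, as a minimal sketch (assuming the
same ownerless allocation that alloc_vlapic_mapping() above performs):

struct page_info *pg = alloc_domheap_page(NULL, 0);   /* no owner */

if ( pg )
{
    page_suppress_refcounting(pg);             /* sets PGC_extra */
    ASSERT(page_refcounting_suppressed(pg));   /* !owner && PGC_extra */
    /* shadow_{get,put}_page_from_l1e() now skip refcounting this page. */
}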


Re: [PATCH v4 1/3] VMX: use a single, global APIC access page
Posted by Roger Pau Monné 3 years ago
On Fri, Apr 23, 2021 at 12:52:57PM +0200, Jan Beulich wrote:
> --- a/xen/arch/x86/mm/shadow/set.c
> +++ b/xen/arch/x86/mm/shadow/set.c
> @@ -94,6 +94,15 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
>      ASSERT(!sh_l1e_is_magic(sl1e));
>      ASSERT(shadow_mode_refcounts(d));
>  
> +    /*
> +     * Check whether refcounting is suppressed on this page. For example,
> +     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
> +     * get accessed, and hence there's no need to refcount it.
> +     */
> +    mfn = shadow_l1e_get_mfn(sl1e);
> +    if ( mfn_valid(mfn) && page_refcounting_suppressed(mfn_to_page(mfn)) )
> +        return 0;
> +
>      res = get_page_from_l1e(sl1e, d, d);
>  
>      /*
> --- a/xen/arch/x86/mm/shadow/types.h
> +++ b/xen/arch/x86/mm/shadow/types.h
> @@ -276,9 +276,16 @@ int shadow_set_l4e(struct domain *d, sha
>  static void inline
>  shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
>  {
> +    mfn_t mfn;
> +
>      if ( !shadow_mode_refcounts(d) )
>          return;
>  
> +    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&

Nit: I would prefer if mfn was assigned outside of the condition, like
it's done in the chunk added to shadow_get_page_from_l1e. The rest
LGTM, so:

Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks, Roger.

Re: [PATCH v4 1/3] VMX: use a single, global APIC access page
Posted by Jan Beulich 3 years ago
On 23.04.2021 16:17, Roger Pau Monné wrote:
> On Fri, Apr 23, 2021 at 12:52:57PM +0200, Jan Beulich wrote:
>> --- a/xen/arch/x86/mm/shadow/set.c
>> +++ b/xen/arch/x86/mm/shadow/set.c
>> @@ -94,6 +94,15 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
>>      ASSERT(!sh_l1e_is_magic(sl1e));
>>      ASSERT(shadow_mode_refcounts(d));
>>  
>> +    /*
>> +     * Check whether refcounting is suppressed on this page. For example,
>> +     * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
>> +     * get accessed, and hence there's no need to refcount it.
>> +     */
>> +    mfn = shadow_l1e_get_mfn(sl1e);
>> +    if ( mfn_valid(mfn) && page_refcounting_suppressed(mfn_to_page(mfn)) )
>> +        return 0;
>> +
>>      res = get_page_from_l1e(sl1e, d, d);
>>  
>>      /*
>> --- a/xen/arch/x86/mm/shadow/types.h
>> +++ b/xen/arch/x86/mm/shadow/types.h
>> @@ -276,9 +276,16 @@ int shadow_set_l4e(struct domain *d, sha
>>  static void inline
>>  shadow_put_page_from_l1e(shadow_l1e_t sl1e, struct domain *d)
>>  {
>> +    mfn_t mfn;
>> +
>>      if ( !shadow_mode_refcounts(d) )
>>          return;
>>  
>> +    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
> 
> Nit: I would prefer if mfn was assigned outside of the condition, like
> it's done in the chunk added to shadow_get_page_from_l1e.

Well, I did it differently here because the variable really is
only needed inside the if(), whereas in "get" the subsequent
patches use it elsewhere as well. I'll wait to see what Tim says.

> The rest LGTM, so:
> 
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Thanks.

Jan

Re: [PATCH v4 1/3] VMX: use a single, global APIC access page
Posted by Tim Deegan 2 years, 12 months ago
At 16:42 +0200 on 23 Apr (1619196141), Jan Beulich wrote:
> On 23.04.2021 16:17, Roger Pau Monné wrote:
> > On Fri, Apr 23, 2021 at 12:52:57PM +0200, Jan Beulich wrote:
> >> +    if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
> > 
> > Nit: I would prefer if mfn was assigned outside of the condition, like
> > it's done in the chunk added to shadow_get_page_from_l1e.
> 
> Well, I did it differently here because the variable really is
> only needed inside the if(), whereas in "get" the subsequent
> patches use it elsewhere as well. I'll wait to see what Tim says.

No strong feelings on this, but since you asked me, I would also
prefer it to be outside the condition.

Cheers,

Tim.

Re: [PATCH v4 1/3] VMX: use a single, global APIC access page
Posted by Tim Deegan 2 years, 12 months ago
At 12:52 +0200 on 23 Apr (1619182377), Jan Beulich wrote:
> The address of this page is used by the CPU only to recognize when to
> access the virtual APIC page instead. No accesses would ever go to this
> page. It only needs to be present in the (CPU) page tables so that
> address translation will produce its address as the result for the
> respective accesses.
> 
> By making this page global, we also eliminate the need to refcount it,
> or to assign it to any domain in the first place.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Looks good, thanks for the changes!

Acked-by: Tim Deegan <tim@xen.org>

RE: [PATCH v4 1/3] VMX: use a single, global APIC access page
Posted by Tian, Kevin 3 years ago
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Friday, April 23, 2021 6:53 PM
> 
> The address of this page is used by the CPU only to recognize when to
> access the virtual APIC page instead. No accesses would ever go to this
> page. It only needs to be present in the (CPU) page tables so that
> address translation will produce its address as the result for the
> respective accesses.
> 
> By making this page global, we also eliminate the need to refcount it,
> or to assign it to any domain in the first place.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Kevin Tian <kevin.tian@intel.com>


[PATCH v4 2/3] x86/shadow: re-use variables in shadow_get_page_from_l1e()
Posted by Jan Beulich 3 years ago
There's little point in doing multiple mfn_to_page() or page_get_owner()
calls on one and the same MFN. Calculate them once at the start of the
function.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
---
v5: Integrate into series. Re-base.
v2: Re-base.
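
The transformation in miniature (an illustrative sketch only; the names
match those used in the hunks below):

/* Before: the same MFN gets converted again at each use site. */
if ( mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
     page_get_owner(mfn_to_page(mfn)) == d )
    /* ... */;

/* After: convert once up front, then reuse pg and owner throughout. */
mfn_t mfn = shadow_l1e_get_mfn(sl1e);
const struct page_info *pg = mfn_valid(mfn) ? mfn_to_page(mfn) : NULL;
struct domain *owner = pg ? page_get_owner(pg) : NULL;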

--- a/xen/arch/x86/mm/shadow/set.c
+++ b/xen/arch/x86/mm/shadow/set.c
@@ -88,19 +88,25 @@ static int inline
 shadow_get_page_from_l1e(shadow_l1e_t sl1e, struct domain *d, p2m_type_t type)
 {
     int res;
-    mfn_t mfn;
-    struct domain *owner;
+    mfn_t mfn = shadow_l1e_get_mfn(sl1e);
+    const struct page_info *pg = NULL;
+    struct domain *owner = NULL;
 
     ASSERT(!sh_l1e_is_magic(sl1e));
     ASSERT(shadow_mode_refcounts(d));
 
+    if ( mfn_valid(mfn) )
+    {
+        pg = mfn_to_page(mfn);
+        owner = page_get_owner(pg);
+    }
+
     /*
      * Check whether refcounting is suppressed on this page. For example,
      * VMX'es APIC access MFN is just a surrogate page.  It doesn't actually
      * get accessed, and hence there's no need to refcount it.
      */
-    mfn = shadow_l1e_get_mfn(sl1e);
-    if ( mfn_valid(mfn) && page_refcounting_suppressed(mfn_to_page(mfn)) )
+    if ( pg && page_refcounting_suppressed(pg) )
         return 0;
 
     res = get_page_from_l1e(sl1e, d, d);
@@ -111,9 +117,7 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
      */
     if ( unlikely(res < 0) &&
          !shadow_mode_translate(d) &&
-         mfn_valid(mfn = shadow_l1e_get_mfn(sl1e)) &&
-         (owner = page_get_owner(mfn_to_page(mfn))) &&
-         (d != owner) )
+         owner && (d != owner) )
     {
         res = xsm_priv_mapping(XSM_TARGET, d, owner);
         if ( !res )
@@ -136,9 +140,8 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
          * already have checked that we're supposed to have access, so
          * we can just grab a reference directly.
          */
-        mfn = shadow_l1e_get_mfn(sl1e);
-        if ( mfn_valid(mfn) )
-            res = get_page_from_l1e(sl1e, d, page_get_owner(mfn_to_page(mfn)));
+        if ( owner )
+            res = get_page_from_l1e(sl1e, d, owner);
     }
 
     if ( unlikely(res < 0) )


[PATCH v4 3/3] x86/shadow: streamline shadow_get_page_from_l1e()
Posted by Jan Beulich 3 years ago
Trying get_page_from_l1e() up to three times isn't helpful; in debug
builds it may lead to log messages making things look as if there was a
problem somewhere. And there's no need to have more than one try: The
function can only possibly succeed for one domain passed as 3rd
argument (unless the page is an MMIO one to which both have access,
but MMIO pages should be "got" by specifying the requesting domain
anyway). Re-arrange things so just the one call gets made which has a
chance of succeeding.

The code could in principle be arranged such that there's only a single
call to get_page_from_l1e(), but the conditional would become pretty
complex then and hence hard to follow / understand / adjust.

The shadow_mode_translate() check, which is redundant with
shadow_mode_refcounts(), gets dropped.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Acked-by: Tim Deegan <tim@xen.org>
---
v5: Integrate into series. Re-base.
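
Why only one of the calls can succeed, as a simplified sketch (the real
check sits inside get_page(), which get_page_from_l1e() relies on;
can_take_ref is a made-up name):

/*
 * A reference can only be obtained against the page's actual owner, so
 * get_page_from_l1e(sl1e, d, X) with the wrong X is guaranteed to fail.
 */
static bool can_take_ref(const struct page_info *pg,
                         const struct domain *expected_owner)
{
    return page_get_owner(pg) == expected_owner;
}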

--- a/xen/arch/x86/mm/shadow/set.c
+++ b/xen/arch/x86/mm/shadow/set.c
@@ -109,40 +109,36 @@ shadow_get_page_from_l1e(shadow_l1e_t sl
     if ( pg && page_refcounting_suppressed(pg) )
         return 0;
 
-    res = get_page_from_l1e(sl1e, d, d);
+    if ( owner == dom_io )
+        owner = NULL;
 
     /*
      * If a privileged domain is attempting to install a map of a page it does
      * not own, we let it succeed anyway.
      */
-    if ( unlikely(res < 0) &&
-         !shadow_mode_translate(d) &&
-         owner && (d != owner) )
+    if ( owner && (d != owner) &&
+         !(res = xsm_priv_mapping(XSM_TARGET, d, owner)) )
     {
-        res = xsm_priv_mapping(XSM_TARGET, d, owner);
-        if ( !res )
-        {
-            res = get_page_from_l1e(sl1e, d, owner);
-            SHADOW_PRINTK("privileged %pd installs map of mfn %"PRI_mfn" owned by %pd: %s\n",
-                           d, mfn_x(mfn), owner,
-                           res >= 0 ? "success" : "failed");
-        }
+        res = get_page_from_l1e(sl1e, d, owner);
+        SHADOW_PRINTK("privileged %pd installs map of %pd's mfn %"PRI_mfn": %s\n",
+                      d, owner, mfn_x(mfn),
+                      res >= 0 ? "success" : "failed");
     }
-
     /* Okay, it might still be a grant mapping PTE.  Try it. */
-    if ( unlikely(res < 0) &&
-         (type == p2m_grant_map_rw ||
-          (type == p2m_grant_map_ro &&
-           !(shadow_l1e_get_flags(sl1e) & _PAGE_RW))) )
+    else if ( owner &&
+              (type == p2m_grant_map_rw ||
+               (type == p2m_grant_map_ro &&
+                !(shadow_l1e_get_flags(sl1e) & _PAGE_RW))) )
     {
         /*
          * It's a grant mapping.  The grant table implementation will
          * already have checked that we're supposed to have access, so
          * we can just grab a reference directly.
          */
-        if ( owner )
-            res = get_page_from_l1e(sl1e, d, owner);
+        res = get_page_from_l1e(sl1e, d, owner);
     }
+    else
+        res = get_page_from_l1e(sl1e, d, d);
 
     if ( unlikely(res < 0) )
     {