In preparation for the introduction of new vCPU operations allowing the
respective areas (one of the two is x86-specific) to be registered by
guest-physical address, add the necessary fork handling (with the
backing function yet to be filled in).
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v3: Extend comment.
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1641,6 +1641,68 @@ static void copy_vcpu_nonreg_state(struc
hvm_set_nonreg_state(cd_vcpu, &nrs);
}
+static int copy_guest_area(struct guest_area *cd_area,
+ const struct guest_area *d_area,
+ struct vcpu *cd_vcpu,
+ const struct domain *d)
+{
+ mfn_t d_mfn, cd_mfn;
+
+ if ( !d_area->pg )
+ return 0;
+
+ d_mfn = page_to_mfn(d_area->pg);
+
+ /* Allocate & map a page for the area if it hasn't been already. */
+ if ( !cd_area->pg )
+ {
+ gfn_t gfn = mfn_to_gfn(d, d_mfn);
+ struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
+ p2m_type_t p2mt;
+ p2m_access_t p2ma;
+ unsigned int offset;
+ int ret;
+
+ cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
+ if ( mfn_eq(cd_mfn, INVALID_MFN) )
+ {
+ struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
+
+ if ( !pg )
+ return -ENOMEM;
+
+ cd_mfn = page_to_mfn(pg);
+ set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
+
+ ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
+ p2m->default_access, -1);
+ if ( ret )
+ return ret;
+ }
+ else if ( p2mt != p2m_ram_rw )
+ return -EBUSY;
+
+ /*
+ * Map the area into the guest. For simplicity specify the entire range
+ * up to the end of the page: All the function uses it for is to check
+ * that the range doesn't cross page boundaries. Having the area mapped
+ * in the original domain implies that it fits there and therefore will
+ * also fit in the clone.
+ */
+ offset = PAGE_OFFSET(d_area->map);
+ ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
+ PAGE_SIZE - offset, cd_area, NULL);
+ if ( ret )
+ return ret;
+ }
+ else
+ cd_mfn = page_to_mfn(cd_area->pg);
+
+ copy_domain_page(cd_mfn, d_mfn);
+
+ return 0;
+}
+
static int copy_vpmu(struct vcpu *d_vcpu, struct vcpu *cd_vcpu)
{
struct vpmu_struct *d_vpmu = vcpu_vpmu(d_vcpu);
@@ -1733,6 +1795,16 @@ static int copy_vcpu_settings(struct dom
copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);
}
+ /* Same for the (physically registered) runstate and time info areas. */
+ ret = copy_guest_area(&cd_vcpu->runstate_guest_area,
+ &d_vcpu->runstate_guest_area, cd_vcpu, d);
+ if ( ret )
+ return ret;
+ ret = copy_guest_area(&cd_vcpu->arch.time_guest_area,
+ &d_vcpu->arch.time_guest_area, cd_vcpu, d);
+ if ( ret )
+ return ret;
+
ret = copy_vpmu(d_vcpu, cd_vcpu);
if ( ret )
return ret;
@@ -1974,7 +2046,10 @@ int mem_sharing_fork_reset(struct domain
state:
if ( reset_state )
+ {
rc = copy_settings(d, pd);
+ /* TBD: What to do here with -ERESTART? */
+ }
domain_unpause(d);
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -1601,6 +1601,13 @@ void unmap_vcpu_info(struct vcpu *v)
put_page_and_type(mfn_to_page(mfn));
}
+int map_guest_area(struct vcpu *v, paddr_t gaddr, unsigned int size,
+ struct guest_area *area,
+ void (*populate)(void *dst, struct vcpu *v))
+{
+ return -EOPNOTSUPP;
+}
+
/*
* This is only intended to be used for domain cleanup (or more generally only
* with at least the respective vCPU, if it's not the current one, reliably
On Thu, Sep 28, 2023 at 09:16:20AM +0200, Jan Beulich wrote:
> In preparation for the introduction of new vCPU operations allowing the
> respective areas (one of the two is x86-specific) to be registered by
> guest-physical address, add the necessary fork handling (with the
> backing function yet to be filled in).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> v3: Extend comment.
>
> --- a/xen/arch/x86/mm/mem_sharing.c
> +++ b/xen/arch/x86/mm/mem_sharing.c
> @@ -1641,6 +1641,68 @@ static void copy_vcpu_nonreg_state(struc
> hvm_set_nonreg_state(cd_vcpu, &nrs);
> }
>
> +static int copy_guest_area(struct guest_area *cd_area,
> + const struct guest_area *d_area,
> + struct vcpu *cd_vcpu,
> + const struct domain *d)
> +{
> + mfn_t d_mfn, cd_mfn;
> +
> + if ( !d_area->pg )
> + return 0;
> +
> + d_mfn = page_to_mfn(d_area->pg);
> +
> + /* Allocate & map a page for the area if it hasn't been already. */
> + if ( !cd_area->pg )
> + {
> + gfn_t gfn = mfn_to_gfn(d, d_mfn);
> + struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
> + p2m_type_t p2mt;
> + p2m_access_t p2ma;
> + unsigned int offset;
> + int ret;
> +
> + cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
> + if ( mfn_eq(cd_mfn, INVALID_MFN) )
> + {
> + struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
> +
> + if ( !pg )
> + return -ENOMEM;
> +
> + cd_mfn = page_to_mfn(pg);
> + set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> +
> + ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> + p2m->default_access, -1);
> + if ( ret )
> + return ret;
> + }
I'm still unsure why map_guest_area() shouldn't be able to deal with a
forked child needing the page to be mapped. What happens when a
forked child executes the hypercall to map such areas against not yet
populated gfns?
Shouldn't map_guest_area() be capable of handling those calls and
populating on demand?
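Something along these lines is what I have in mind - only a rough
sketch, with resolve_area_gfn() being a made-up helper, and assuming
the usual get_page_from_gfn() semantics, where P2M_ALLOC backs PoD
entries on lookup:

    static int resolve_area_gfn(struct domain *d, gfn_t gfn,
                                struct page_info **pg)
    {
        p2m_type_t p2mt;
        /*
         * P2M_ALLOC demand-fills PoD entries; a truly empty entry
         * still yields NULL, which is the fork-child case at hand.
         */
        struct page_info *page = get_page_from_gfn(d, gfn_x(gfn), &p2mt,
                                                   P2M_ALLOC | P2M_UNSHARE);

        if ( !page )
            return -ENOENT;

        if ( p2mt != p2m_ram_rw )
        {
            put_page(page);
            return -EBUSY;
        }

        *pg = page;
        return 0;
    }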
> + else if ( p2mt != p2m_ram_rw )
> + return -EBUSY;
> +
> + /*
> + * Map the area into the guest. For simplicity specify the entire range
> + * up to the end of the page: All the function uses it for is to check
> + * that the range doesn't cross page boundaries. Having the area mapped
> + * in the original domain implies that it fits there and therefore will
> + * also fit in the clone.
> + */
> + offset = PAGE_OFFSET(d_area->map);
> + ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
> + PAGE_SIZE - offset, cd_area, NULL);
> + if ( ret )
> + return ret;
> + }
> + else
> + cd_mfn = page_to_mfn(cd_area->pg);
> +
> + copy_domain_page(cd_mfn, d_mfn);
I think the page copy should be done only once, when the page is
populated on the child p2m. Otherwise areas smaller than a page size
> (like vcpu_time_info_t) that share the same page will get multiple
copies of the same data for no reason.
Thanks, Roger.
On 28.09.2023 11:51, Roger Pau Monné wrote:
> On Thu, Sep 28, 2023 at 09:16:20AM +0200, Jan Beulich wrote:
>> --- a/xen/arch/x86/mm/mem_sharing.c
>> +++ b/xen/arch/x86/mm/mem_sharing.c
>> @@ -1641,6 +1641,68 @@ static void copy_vcpu_nonreg_state(struc
>> hvm_set_nonreg_state(cd_vcpu, &nrs);
>> }
>>
>> +static int copy_guest_area(struct guest_area *cd_area,
>> + const struct guest_area *d_area,
>> + struct vcpu *cd_vcpu,
>> + const struct domain *d)
>> +{
>> + mfn_t d_mfn, cd_mfn;
>> +
>> + if ( !d_area->pg )
>> + return 0;
>> +
>> + d_mfn = page_to_mfn(d_area->pg);
>> +
>> + /* Allocate & map a page for the area if it hasn't been already. */
>> + if ( !cd_area->pg )
>> + {
>> + gfn_t gfn = mfn_to_gfn(d, d_mfn);
>> + struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
>> + p2m_type_t p2mt;
>> + p2m_access_t p2ma;
>> + unsigned int offset;
>> + int ret;
>> +
>> + cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
>> + if ( mfn_eq(cd_mfn, INVALID_MFN) )
>> + {
>> + struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
>> +
>> + if ( !pg )
>> + return -ENOMEM;
>> +
>> + cd_mfn = page_to_mfn(pg);
>> + set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
>> +
>> + ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
>> + p2m->default_access, -1);
>> + if ( ret )
>> + return ret;
>> + }
>
> I'm still unsure why map_guest_area() shouldn't be able to deal with a
> forked child needing the page to be mapped. What happens when a
> forked child executes the hypercall to map such areas against not yet
> populated gfns?
>
> Shouldn't map_guest_area() be capable of handling those calls and
> populating on demand?
Funny you should use this wording: P2M_ALLOC will deal with populating
PoD slots, yes, but aiui forks don't fill their p2m with all PoD slots,
but rather leave them empty. Yet again I need to leave it to Tamas to
confirm or prove me wrong.
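To illustrate the distinction (again only a sketch -
p2m_alloc_can_resolve() is a made-up name, and whether forks really
leave their slots entirely empty is exactly the part needing
confirmation):

    /* Would a (P2M_ALLOC-style) lookup of this slot yield a page? */
    static bool p2m_alloc_can_resolve(struct p2m_domain *p2m, gfn_t gfn)
    {
        p2m_type_t p2mt;
        p2m_access_t p2ma;
        mfn_t mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);

        /* A PoD slot gets backed by the P2M_ALLOC lookup itself. */
        if ( p2mt == p2m_populate_on_demand )
            return true;

        /*
         * Anything else with INVALID_MFN is entirely empty and hence
         * beyond P2M_ALLOC's reach - the caller has to allocate, as
         * the hunk above does.
         */
        return !mfn_eq(mfn, INVALID_MFN);
    }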
>> + else if ( p2mt != p2m_ram_rw )
>> + return -EBUSY;
>> +
>> + /*
>> + * Map the area into the guest. For simplicity specify the entire range
>> + * up to the end of the page: All the function uses it for is to check
>> + * that the range doesn't cross page boundaries. Having the area mapped
>> + * in the original domain implies that it fits there and therefore will
>> + * also fit in the clone.
>> + */
>> + offset = PAGE_OFFSET(d_area->map);
>> + ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
>> + PAGE_SIZE - offset, cd_area, NULL);
>> + if ( ret )
>> + return ret;
>> + }
>> + else
>> + cd_mfn = page_to_mfn(cd_area->pg);
>> +
>> + copy_domain_page(cd_mfn, d_mfn);
>
> I think the page copy should be done only once, when the page is
> populated on the child p2m. Otherwise areas smaller than a page size
> > (like vcpu_time_info_t) that share the same page will get multiple
> copies of the same data for no reason.
I think you're right, but this would then be another issue in the original
code that I merely didn't spot (and it's not just "copy for no reason",
we'd actually corrupt what was put there before). IOW the copying needs to
move ahead of map_guest_area() (or yet more precisely after the error
checking for p2m->set_entry()), and in the original code it would have
needed to live ahead of map_vcpu_info(). Once again I'd like Tamas to
confirm (or otherwise) before making that change, though.
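Concretely, the tail of the "freshly allocated" path above would then
become something like this (sketch only, with the unconditional
copy_domain_page() at the end of the function dropped):

    ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
                         p2m->default_access, -1);
    if ( ret )
        return ret;

    /*
     * Copy exactly once, right after (successfully) populating the
     * slot.  Copying after map_guest_area() would clobber what an
     * already mapped area sharing this page may have put there.
     */
    copy_domain_page(cd_mfn, d_mfn);
    }
    else if ( p2mt != p2m_ram_rw )
        return -EBUSY;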
Jan
On Thu, Sep 28, 2023 at 12:11:02PM +0200, Jan Beulich wrote:
> On 28.09.2023 11:51, Roger Pau Monné wrote:
> > On Thu, Sep 28, 2023 at 09:16:20AM +0200, Jan Beulich wrote:
> >> --- a/xen/arch/x86/mm/mem_sharing.c
> >> +++ b/xen/arch/x86/mm/mem_sharing.c
> >> @@ -1641,6 +1641,68 @@ static void copy_vcpu_nonreg_state(struc
> >> hvm_set_nonreg_state(cd_vcpu, &nrs);
> >> }
> >>
> >> +static int copy_guest_area(struct guest_area *cd_area,
> >> + const struct guest_area *d_area,
> >> + struct vcpu *cd_vcpu,
> >> + const struct domain *d)
> >> +{
> >> + mfn_t d_mfn, cd_mfn;
> >> +
> >> + if ( !d_area->pg )
> >> + return 0;
> >> +
> >> + d_mfn = page_to_mfn(d_area->pg);
> >> +
> >> + /* Allocate & map a page for the area if it hasn't been already. */
> >> + if ( !cd_area->pg )
> >> + {
> >> + gfn_t gfn = mfn_to_gfn(d, d_mfn);
> >> + struct p2m_domain *p2m = p2m_get_hostp2m(cd_vcpu->domain);
> >> + p2m_type_t p2mt;
> >> + p2m_access_t p2ma;
> >> + unsigned int offset;
> >> + int ret;
> >> +
> >> + cd_mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
> >> + if ( mfn_eq(cd_mfn, INVALID_MFN) )
> >> + {
> >> + struct page_info *pg = alloc_domheap_page(cd_vcpu->domain, 0);
> >> +
> >> + if ( !pg )
> >> + return -ENOMEM;
> >> +
> >> + cd_mfn = page_to_mfn(pg);
> >> + set_gpfn_from_mfn(mfn_x(cd_mfn), gfn_x(gfn));
> >> +
> >> + ret = p2m->set_entry(p2m, gfn, cd_mfn, PAGE_ORDER_4K, p2m_ram_rw,
> >> + p2m->default_access, -1);
> >> + if ( ret )
> >> + return ret;
> >> + }
> >
> > I'm still unsure why map_guest_area() shouldn't be able to deal with a
> > forked child needing the page to be mapped. What happens when a
> > forked child executes the hypercall to map such areas against not yet
> > populated gfns?
> >
> > Shouldn't map_guest_area() be capable of handling those calls and
> > populating on demand?
>
> Funny you should use this wording: P2M_ALLOC will deal with populating
> PoD slots, yes, but aiui forks don't fill their p2m with all PoD slots,
> but rather leave them empty. Yet again I need to leave it to Tamas to
> confirm or prove me wrong.
If populating the child p2m is only triggered by guest accesses, then a
lot of hypercalls are likely to not work correctly on children.
>
> >> + else if ( p2mt != p2m_ram_rw )
> >> + return -EBUSY;
> >> +
> >> + /*
> >> + * Map the area into the guest. For simplicity specify the entire range
> >> + * up to the end of the page: All the function uses it for is to check
> >> + * that the range doesn't cross page boundaries. Having the area mapped
> >> + * in the original domain implies that it fits there and therefore will
> >> + * also fit in the clone.
> >> + */
> >> + offset = PAGE_OFFSET(d_area->map);
> >> + ret = map_guest_area(cd_vcpu, gfn_to_gaddr(gfn) + offset,
> >> + PAGE_SIZE - offset, cd_area, NULL);
> >> + if ( ret )
> >> + return ret;
> >> + }
> >> + else
> >> + cd_mfn = page_to_mfn(cd_area->pg);
> >> +
> >> + copy_domain_page(cd_mfn, d_mfn);
> >
> > I think the page copy should be done only once, when the page is
> > populated on the child p2m. Otherwise areas smaller than a page size
> > > (like vcpu_time_info_t) that share the same page will get multiple
> > copies of the same data for no reason.
>
> I think you're right, but this would then be another issue in the original
> code that I merely didn't spot (and it's not just "copy for no reason",
> we'd actually corrupt what was put there before). IOW the copying needs to
> move ahead of map_guest_area() (or yet more precisely after the error
> checking for p2m->set_entry()), and in the original code it would have
> needed to live ahead of map_vcpu_info(). Once again I'd like Tamas to
> confirm (or otherwise) before making that change, though.
Yes, it's already an issue in the current code. I wonder whether
logic in the guest or Xen could malfunction because map_vcpu_info()
unconditionally sets evtchn_upcall_pending and injects an event
channel upcall, while the later call to copy_domain_page() might
clear evtchn_upcall_pending again even though the vector has already
been injected.
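In sequence form, the flow I'm worried about (mirroring the existing
copy_vcpu_settings() code, not a proposed change):

    /*
     * Sets evtchn_upcall_pending, sets all of evtchn_pending_sel and
     * injects the upcall vector into the child vCPU.
     */
    ret = map_vcpu_info(cd_vcpu, gfn_x(gfn), offset);
    if ( ret )
        return ret;

    /*
     * Wholesale overwrite with the parent's page contents: if the
     * parent had evtchn_upcall_pending clear, the flag set just above
     * is lost while the injected vector may still be pending delivery.
     */
    copy_domain_page(new_vcpu_info_mfn, vcpu_info_mfn);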
Thanks, Roger.