[PATCH] x86/hvm: allow for more fine grained assisted flush
Posted by Roger Pau Monne 3 years, 12 months ago
Improve the assisted flush by expanding the interface and allowing for
more fine grained TLB flushes to be issued using the HVMOP_flush_tlbs
hypercall. Support for such advanced flushes is signaled in CPUID
using the XEN_HVM_CPUID_ADVANCED_FLUSH flag.

The new features make use of the hypercall parameter, so far required
to be NULL, in order to convey extra data for more selective flushes: a
virtual address, an order field, a flags field and finally a vCPU
bitmap. Note that not all features are implemented as part of this
patch, but they are already added to the interface in order to avoid
having to introduce yet another CPUID flag when the remaining features
are implemented.

The feature currently implemented is the use of a guest-provided vCPU
bitmap to signal which vCPUs require a TLB flush, instead of assuming
all vCPUs must be flushed. Note that not implementing the rest of the
features just makes the flush less efficient, but it remains correct
and safe.

Finally add support for Xen running in guest mode (Xen on Xen or PV
shim mode) to use the newly introduced flush options when available.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/guest/xen/xen.c        | 22 +++++++++++++++++-
 xen/arch/x86/hvm/hvm.c              | 35 ++++++++++++++++++++++++-----
 xen/arch/x86/traps.c                |  4 +++-
 xen/include/public/arch-x86/cpuid.h |  2 ++
 xen/include/public/hvm/hvm_op.h     | 23 ++++++++++++++++++-
 5 files changed, 78 insertions(+), 8 deletions(-)

diff --git a/xen/arch/x86/guest/xen/xen.c b/xen/arch/x86/guest/xen/xen.c
index 2ff63d370a..310ce0c6fb 100644
--- a/xen/arch/x86/guest/xen/xen.c
+++ b/xen/arch/x86/guest/xen/xen.c
@@ -326,7 +326,27 @@ static void __init e820_fixup(struct e820map *e820)
 
 static int flush_tlb(const cpumask_t *mask, const void *va, unsigned int flags)
 {
-    return xen_hypercall_hvm_op(HVMOP_flush_tlbs, NULL);
+    xen_hvm_flush_tlbs_t op = {
+        .va = (unsigned long)va,
+        .order = (flags - 1) & FLUSH_ORDER_MASK,
+        .flags = ((flags & FLUSH_TLB_GLOBAL) ? HVMOP_flush_global : 0) |
+                 ((flags & FLUSH_VA_VALID) ? HVMOP_flush_va_valid : 0),
+        .mask_size = BITS_TO_LONGS(nr_cpu_ids),
+    };
+    static int8_t __read_mostly advanced_flush = -1;
+
+    if ( advanced_flush == -1 )
+    {
+        uint32_t eax, ebx, ecx, edx;
+
+        ASSERT(xen_cpuid_base);
+        cpuid(xen_cpuid_base + 4, &eax, &ebx, &ecx, &edx);
+        advanced_flush = (eax & XEN_HVM_CPUID_ADVANCED_FLUSH) ? 1 : 0;
+    }
+
+    set_xen_guest_handle(op.vcpu_mask, cpumask_bits(mask));
+
+    return xen_hypercall_hvm_op(HVMOP_flush_tlbs, advanced_flush ? &op : NULL);
 }
 
 static const struct hypervisor_ops __initconstrel ops = {
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 814b7020d8..1d41b6e2ea 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -4011,17 +4011,42 @@ static void hvm_s3_resume(struct domain *d)
     }
 }
 
-static bool always_flush(void *ctxt, struct vcpu *v)
+static bool flush_check(void *mask, struct vcpu *v)
 {
-    return true;
+    return mask ? test_bit(v->vcpu_id, (unsigned long *)mask) : true;
 }
 
-static int hvmop_flush_tlb_all(void)
+static int hvmop_flush_tlb(XEN_GUEST_HANDLE_PARAM(xen_hvm_flush_tlbs_t) uop)
 {
+    xen_hvm_flush_tlbs_t op;
+    DECLARE_BITMAP(mask, HVM_MAX_VCPUS) = { };
+    bool valid_mask = false;
+
     if ( !is_hvm_domain(current->domain) )
         return -EINVAL;
 
-    return paging_flush_tlb(always_flush, NULL) ? 0 : -ERESTART;
+    if ( !guest_handle_is_null(uop) )
+    {
+        if ( copy_from_guest(&op, uop, 1) )
+            return -EFAULT;
+
+        /*
+         * TODO: implement support for the other features present in
+         * xen_hvm_flush_tlbs_t: flushing a specific virtual address and not
+         * flushing global mappings.
+         */
+
+        if ( op.mask_size > ARRAY_SIZE(mask) )
+            return -EINVAL;
+
+        if ( copy_from_guest(mask, op.vcpu_mask, op.mask_size) )
+            return -EFAULT;
+
+        valid_mask = true;
+    }
+
+    return paging_flush_tlb(flush_check, valid_mask ? mask : NULL) ? 0
+                                                                   : -ERESTART;
 }
 
 static int hvmop_set_evtchn_upcall_vector(
@@ -5017,7 +5042,7 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
         break;
 
     case HVMOP_flush_tlbs:
-        rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -EINVAL;
+        rc = hvmop_flush_tlb(guest_handle_cast(arg, xen_hvm_flush_tlbs_t));
         break;
 
     case HVMOP_get_mem_type:
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index e838846c6b..b07da2fd33 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -966,8 +966,10 @@ void cpuid_hypervisor_leaves(const struct vcpu *v, uint32_t leaf,
         /*
          * Indicate that memory mapped from other domains (either grants or
          * foreign pages) has valid IOMMU entries.
+         *
+         * Also signal support for more selective assisted flushes.
          */
-        res->a |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
+        res->a |= XEN_HVM_CPUID_IOMMU_MAPPINGS | XEN_HVM_CPUID_ADVANCED_FLUSH;
 
         /* Indicate presence of vcpu id and set it in ebx */
         res->a |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
diff --git a/xen/include/public/arch-x86/cpuid.h b/xen/include/public/arch-x86/cpuid.h
index ce46305bee..b980ed91f6 100644
--- a/xen/include/public/arch-x86/cpuid.h
+++ b/xen/include/public/arch-x86/cpuid.h
@@ -102,6 +102,8 @@
 #define XEN_HVM_CPUID_IOMMU_MAPPINGS   (1u << 2)
 #define XEN_HVM_CPUID_VCPU_ID_PRESENT  (1u << 3) /* vcpu id is present in EBX */
 #define XEN_HVM_CPUID_DOMID_PRESENT    (1u << 4) /* domid is present in ECX */
+/* Supports more fine grained assisted flush, see HVMOP_flush_tlbs. */
+#define XEN_HVM_CPUID_ADVANCED_FLUSH   (1u << 5)
 
 /*
  * Leaf 6 (0x40000x05)
diff --git a/xen/include/public/hvm/hvm_op.h b/xen/include/public/hvm/hvm_op.h
index 870ec52060..ca6ae678bc 100644
--- a/xen/include/public/hvm/hvm_op.h
+++ b/xen/include/public/hvm/hvm_op.h
@@ -99,8 +99,29 @@ DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t);
 
 #endif /* __XEN_INTERFACE_VERSION__ < 0x00040900 */
 
-/* Flushes all VCPU TLBs: @arg must be NULL. */
+/*
+ * Flushes all VCPU TLBs: @arg can be NULL or xen_hvm_flush_tlbs_t.
+ *
+ * Support for passing a xen_hvm_flush_tlbs_t parameter is signaled in CPUID,
+ * see XEN_HVM_CPUID_ADVANCED_FLUSH.
+ */
 #define HVMOP_flush_tlbs          5
+struct xen_hvm_flush_tlbs {
+    /* Virtual address to be flushed. */
+    uint64_t va;
+    uint16_t order;
+    uint16_t flags;
+/* Flush global mappings. */
+#define HVMOP_flush_global      (1u << 0)
+/* VA for the flush has a valid mapping. */
+#define HVMOP_flush_va_valid    (1u << 1)
+    /* Number of uint64_t elements in vcpu_mask. */
+    uint32_t mask_size;
+    /* Bitmask of vcpus that should be flushed. */
+    XEN_GUEST_HANDLE(const_uint64) vcpu_mask;
+};
+typedef struct xen_hvm_flush_tlbs xen_hvm_flush_tlbs_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_flush_tlbs_t);
 
 /*
  * hvmmem_type_t should not be defined when generating the corresponding
-- 
2.26.0


Re: [PATCH] x86/hvm: allow for more fine grained assisted flush
Posted by Jan Beulich 3 years, 11 months ago
On 30.04.2020 11:17, Roger Pau Monne wrote:
> Improve the assisted flush by expanding the interface and allowing for
> more fine grained TLB flushes to be issued using the HVMOP_flush_tlbs
> hypercall. Support for such advanced flushes is signaled in CPUID
> using the XEN_HVM_CPUID_ADVANCED_FLUSH flag.
> 
> The new features make use of the hypercall parameter, so far required
> to be NULL, in order to convey extra data for more selective flushes:
> a virtual address, an order field, a flags field and finally a vCPU
> bitmap. Note that not all features are implemented as part of this
> patch, but they are already added to the interface in order to avoid
> having to introduce yet another CPUID flag when the remaining features
> are implemented.
> 
> The feature currently implemented is the use of a guest-provided vCPU
> bitmap to signal which vCPUs require a TLB flush, instead of assuming
> all vCPUs must be flushed. Note that not implementing the rest of the
> features just makes the flush less efficient, but it remains correct
> and safe.

A risk of not supporting these right away is that guest bugs may go
unnoticed for quite some time. Just as a remark, not as a request
to do the implementation right away.

> --- a/xen/arch/x86/guest/xen/xen.c
> +++ b/xen/arch/x86/guest/xen/xen.c
> @@ -326,7 +326,27 @@ static void __init e820_fixup(struct e820map *e820)
>  
>  static int flush_tlb(const cpumask_t *mask, const void *va, unsigned int flags)
>  {
> -    return xen_hypercall_hvm_op(HVMOP_flush_tlbs, NULL);
> +    xen_hvm_flush_tlbs_t op = {
> +        .va = (unsigned long)va,
> +        .order = (flags - 1) & FLUSH_ORDER_MASK,

I consider such an expression as reasonable to use when there's a
single, central place (in flushtlb.c). For uses elsewhere (and
then better mirrored to the original place) I think we want to
have a macro inverting FLUSH_ORDER(), e.g. FLUSH_GET_ORDER().
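Something along these lines, next to FLUSH_ORDER() in flushtlb.h (just
a sketch, name and exact placement of course up for discussion):

    #define FLUSH_GET_ORDER(flags) (((flags) - 1) & FLUSH_ORDER_MASK)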

> +        .flags = ((flags & FLUSH_TLB_GLOBAL) ? HVMOP_flush_global : 0) |
> +                 ((flags & FLUSH_VA_VALID) ? HVMOP_flush_va_valid : 0),
> +        .mask_size = BITS_TO_LONGS(nr_cpu_ids),
> +    };
> +    static int8_t __read_mostly advanced_flush = -1;
> +
> +    if ( advanced_flush == -1 )
> +    {
> +        uint32_t eax, ebx, ecx, edx;
> +
> +        ASSERT(xen_cpuid_base);
> +        cpuid(xen_cpuid_base + 4, &eax, &ebx, &ecx, &edx);
> +        advanced_flush = (eax & XEN_HVM_CPUID_ADVANCED_FLUSH) ? 1 : 0;

Use the more conventional (in our code base) !! here? Also use
cpuid_eax(), to avoid several dead locals?
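I.e. something like (untested):

    advanced_flush = !!(cpuid_eax(xen_cpuid_base + 4) &
                        XEN_HVM_CPUID_ADVANCED_FLUSH);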

> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -4011,17 +4011,42 @@ static void hvm_s3_resume(struct domain *d)
>      }
>  }
>  
> -static bool always_flush(void *ctxt, struct vcpu *v)
> +static bool flush_check(void *mask, struct vcpu *v)

const twice?

>  {
> -    return true;
> +    return mask ? test_bit(v->vcpu_id, (unsigned long *)mask) : true;

And a 3rd time?
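I.e. (untested, and assuming the paging_flush_tlb() callback type gets
constified to match):

    static bool flush_check(const void *mask, const struct vcpu *v)
    {
        return mask ? test_bit(v->vcpu_id, (const unsigned long *)mask)
                    : true;
    }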

>  }
>  
> -static int hvmop_flush_tlb_all(void)
> +static int hvmop_flush_tlb(XEN_GUEST_HANDLE_PARAM(xen_hvm_flush_tlbs_t) uop)
>  {
> +    xen_hvm_flush_tlbs_t op;

This could move into the more narrow scope below.
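E.g. (sketch only):

    if ( !guest_handle_is_null(uop) )
    {
        xen_hvm_flush_tlbs_t op;

        if ( copy_from_guest(&op, uop, 1) )
            return -EFAULT;
        ...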

> +    DECLARE_BITMAP(mask, HVM_MAX_VCPUS) = { };
> +    bool valid_mask = false;
> +
>      if ( !is_hvm_domain(current->domain) )
>          return -EINVAL;
>  
> -    return paging_flush_tlb(always_flush, NULL) ? 0 : -ERESTART;
> +    if ( !guest_handle_is_null(uop) )
> +    {
> +        if ( copy_from_guest(&op, uop, 1) )
> +            return -EFAULT;
> +
> +        /*
> +         * TODO: implement support for the other features present in
> +         * xen_hvm_flush_tlbs_t: flushing a specific virtual address and not
> +         * flushing global mappings.
> +         */
> +
> +        if ( op.mask_size > ARRAY_SIZE(mask) )
> +            return -EINVAL;

While a proper safeguard for the implementation, this looks rather
arbitrary from the guest's pov: Bits beyond the guest's vCPU count
aren't meaningful anyway. They could be ignored by not copying here,
rather than by never inspecting them in flush_check(). And ignoring
some bits here would then call for this to be done consistently for
all of them, i.e. not returning -EINVAL.
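I.e. perhaps simply (sketch, untested):

    if ( copy_from_guest(mask, op.vcpu_mask,
                         min_t(uint32_t, op.mask_size, ARRAY_SIZE(mask))) )
        return -EFAULT;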

> +        if ( copy_from_guest(mask, op.vcpu_mask, op.mask_size) )
> +            return -EFAULT;
> +
> +        valid_mask = true;
> +    }
> +
> +    return paging_flush_tlb(flush_check, valid_mask ? mask : NULL) ? 0
> +                                                                   : -ERESTART;

Just as a suggestion, this might look a little less odd when wrapped
as

    return paging_flush_tlb(flush_check,
                            valid_mask ? mask : NULL) ? 0 : -ERESTART;

But it's really up to you.

> @@ -5017,7 +5042,7 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
>          break;
>  
>      case HVMOP_flush_tlbs:
> -        rc = guest_handle_is_null(arg) ? hvmop_flush_tlb_all() : -EINVAL;
> +        rc = hvmop_flush_tlb(guest_handle_cast(arg, xen_hvm_flush_tlbs_t));

There's nothing to be passed back so maybe you could even cast to a
const handle here?

> --- a/xen/include/public/hvm/hvm_op.h
> +++ b/xen/include/public/hvm/hvm_op.h
> @@ -99,8 +99,29 @@ DEFINE_XEN_GUEST_HANDLE(xen_hvm_set_pci_link_route_t);
>  
>  #endif /* __XEN_INTERFACE_VERSION__ < 0x00040900 */
>  
> -/* Flushes all VCPU TLBs: @arg must be NULL. */
> +/*
> + * Flushes all VCPU TLBs: @arg can be NULL or xen_hvm_flush_tlbs_t.
> + *
> + * Support for passing a xen_hvm_flush_tlbs_t parameter is signaled in CPUID,
> + * see XEN_HVM_CPUID_ADVANCED_FLUSH.
> + */
>  #define HVMOP_flush_tlbs          5
> +struct xen_hvm_flush_tlbs {
> +    /* Virtual address to be flushed. */
> +    uint64_t va;
> +    uint16_t order;
> +    uint16_t flags;
> +/* Flush global mappings. */
> +#define HVMOP_flush_global      (1u << 0)
> +/* VA for the flush has a valid mapping. */
> +#define HVMOP_flush_va_valid    (1u << 1)
> +    /* Number of uint64_t elements in vcpu_mask. */
> +    uint32_t mask_size;
> +    /* Bitmask of vcpus that should be flushed. */
> +    XEN_GUEST_HANDLE(const_uint64) vcpu_mask;

This will make the structure have different size for 64- and 32-bit
callers. Apart from altp2m interfaces there's no precedent of a
handle in the hvmop structures, so I wonder whether this wouldn't
better be replaced, e.g. by having an effectively variable size
array at the end of the struct. Simply forcing suitable padding,
e.g. by using uint64_aligned_t for the first field, won't help, as
this would lead to 32-bit callers not having suitable control over
the upper 32 bits of what Xen will take as the handle. (And of
course right now both uint64_aligned_t and XEN_GUEST_HANDLE_64 are
Xen+tools constructs only.)
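E.g. something along the lines of (sketch only, exact layout and
padding to be double checked):

    struct xen_hvm_flush_tlbs {
        uint64_t va;
        uint16_t order;
        uint16_t flags;
        uint32_t mask_size;    /* Number of uint64_t elements in vcpu_mask. */
        uint64_t vcpu_mask[1]; /* Variable length; mask_size elements. */
    };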

Jan

Re: [PATCH] x86/hvm: allow for more fine grained assisted flush
Posted by Roger Pau Monné 3 years, 12 months ago
On Thu, Apr 30, 2020 at 11:17:25AM +0200, Roger Pau Monne wrote:
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index 814b7020d8..1d41b6e2ea 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -4011,17 +4011,42 @@ static void hvm_s3_resume(struct domain *d)
>      }
>  }
>  
> -static bool always_flush(void *ctxt, struct vcpu *v)
> +static bool flush_check(void *mask, struct vcpu *v)
>  {
> -    return true;
> +    return mask ? test_bit(v->vcpu_id, (unsigned long *)mask) : true;
>  }
>  
> -static int hvmop_flush_tlb_all(void)
> +static int hvmop_flush_tlb(XEN_GUEST_HANDLE_PARAM(xen_hvm_flush_tlbs_t) uop)
>  {
> +    xen_hvm_flush_tlbs_t op;
> +    DECLARE_BITMAP(mask, HVM_MAX_VCPUS) = { };
> +    bool valid_mask = false;
> +
>      if ( !is_hvm_domain(current->domain) )
>          return -EINVAL;
>  
> -    return paging_flush_tlb(always_flush, NULL) ? 0 : -ERESTART;
> +    if ( !guest_handle_is_null(uop) )
> +    {
> +        if ( copy_from_guest(&op, uop, 1) )
> +            return -EFAULT;
> +
> +        /*
> +         * TODO: implement support for the other features present in
> +         * xen_hvm_flush_tlbs_t: flushing a specific virtual address and not
> +         * flushing global mappings.
> +         */
> +
> +        if ( op.mask_size > ARRAY_SIZE(mask) )

This should also check that the passed-in flags are valid, i.e.:

if ( op.mask_size > ARRAY_SIZE(mask) ||
     (op.flags & ~(HVMOP_flush_global | HVMOP_flush_va_valid)) )
     return -EINVAL;

Roger.