[PATCH v2] x86/hvm: Remove callback from paging->flush_tlb() hook

Andrew Cooper posted 1 patch 2 weeks, 3 days ago
Test gitlab-ci failed
Patches applied successfully (tree, apply log)
git fetch https://gitlab.com/xen-project/patchew/xen tags/patchew/20211118184856.20229-1-andrew.cooper3@citrix.com
xen/arch/x86/hvm/hvm.c               |  7 +------
xen/arch/x86/hvm/viridian/viridian.c | 34 +++++++++++-----------------------
xen/arch/x86/mm/hap/hap.c            | 11 ++++++++---
xen/arch/x86/mm/shadow/common.c      | 18 +++++++++++-------
xen/arch/x86/mm/shadow/private.h     |  3 +--
xen/include/asm-x86/paging.h         | 11 ++++-------
6 files changed, 36 insertions(+), 48 deletions(-)

[PATCH v2] x86/hvm: Remove callback from paging->flush_tlb() hook

Posted by Andrew Cooper 2 weeks, 3 days ago
TLB flushing is a hotpath, and function pointer calls are
expensive (especially under repoline) for what amounts to an identity
transform on the data.  Just pass the vcpu_bitmap bitmap directly.

As we use NULL for all rather than none, introduce a flush_vcpu() helper to
avoid the risk of logical errors from opencoding the expression.  This also
means the viridian callers can avoid writing an all-ones bitmap for the
flushing logic to consume.

No functional change.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
CC: Paul Durrant <paul@xen.org>

v2:
 * Also short-circuit the hvcall_flush_ex() HV_GENERIC_SET_ALL path.

The result even compiles smaller:

  $ ../scripts/bloat-o-meter xen-syms-before xen-syms-after
  add/remove: 0/2 grow/shrink: 0/3 up/down: 0/-125 (-125)
  Function                                     old     new   delta
  flush_tlb                                    207     205      -2
  always_flush                                   6       -      -6
  need_flush                                     9       -      -9
  do_hvm_op                                   2434    2418     -16
  shadow_flush_tlb                             628     536     -92

but this is very much not the point of the patch.

It would be rather more efficient in this case and probably others for
for_each_vcpu() to iterate over d->vcpu[] than follow the v->next_vcpu pointer
chain (better locality of access), and also because then the vCPU id is the
loop induction variable, not a value read from memory.
---
 xen/arch/x86/hvm/hvm.c               |  7 +------
 xen/arch/x86/hvm/viridian/viridian.c | 34 +++++++++++-----------------------
 xen/arch/x86/mm/hap/hap.c            | 11 ++++++++---
 xen/arch/x86/mm/shadow/common.c      | 18 +++++++++++-------
 xen/arch/x86/mm/shadow/private.h     |  3 +--
 xen/include/asm-x86/paging.h         | 11 ++++-------
 6 files changed, 36 insertions(+), 48 deletions(-)

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index eee365711d63..31e9474db093 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -4047,17 +4047,12 @@ static void hvm_s3_resume(struct domain *d)
     }
 }
 
-static bool always_flush(void *ctxt, struct vcpu *v)
-{
-    return true;
-}
-
 static int hvmop_flush_tlb_all(void)
 {
     if ( !is_hvm_domain(current->domain) )
         return -EINVAL;
 
-    return paging_flush_tlb(always_flush, NULL) ? 0 : -ERESTART;
+    return paging_flush_tlb(NULL) ? 0 : -ERESTART;
 }
 
 static int hvmop_set_evtchn_upcall_vector(
diff --git a/xen/arch/x86/hvm/viridian/viridian.c b/xen/arch/x86/hvm/viridian/viridian.c
index b906f7b86a74..51d50e194e6d 100644
--- a/xen/arch/x86/hvm/viridian/viridian.c
+++ b/xen/arch/x86/hvm/viridian/viridian.c
@@ -574,13 +574,6 @@ static void vpmask_fill(struct hypercall_vpmask *vpmask)
     bitmap_fill(vpmask->mask, HVM_MAX_VCPUS);
 }
 
-static bool vpmask_test(const struct hypercall_vpmask *vpmask,
-                        unsigned int vp)
-{
-    ASSERT(vp < HVM_MAX_VCPUS);
-    return test_bit(vp, vpmask->mask);
-}
-
 static unsigned int vpmask_first(const struct hypercall_vpmask *vpmask)
 {
     return find_first_bit(vpmask->mask, HVM_MAX_VCPUS);
@@ -669,17 +662,6 @@ static uint16_t hv_vpset_to_vpmask(const struct hv_vpset *set,
 #undef NR_VPS_PER_BANK
 }
 
-/*
- * Windows should not issue the hypercalls requiring this callback in the
- * case where vcpu_id would exceed the size of the mask.
- */
-static bool need_flush(void *ctxt, struct vcpu *v)
-{
-    struct hypercall_vpmask *vpmask = ctxt;
-
-    return vpmask_test(vpmask, v->vcpu_id);
-}
-
 union hypercall_input {
     uint64_t raw;
     struct {
@@ -714,6 +696,7 @@ static int hvcall_flush(const union hypercall_input *input,
         uint64_t flags;
         uint64_t vcpu_mask;
     } input_params;
+    unsigned long *vcpu_bitmap;
 
     /* These hypercalls should never use the fast-call convention. */
     if ( input->fast )
@@ -730,18 +713,19 @@ static int hvcall_flush(const union hypercall_input *input,
      * so err on the safe side.
      */
     if ( input_params.flags & HV_FLUSH_ALL_PROCESSORS )
-        vpmask_fill(vpmask);
+        vcpu_bitmap = NULL;
     else
     {
         vpmask_empty(vpmask);
         vpmask_set(vpmask, 0, input_params.vcpu_mask);
+        vcpu_bitmap = vpmask->mask;
     }
 
     /*
      * A false return means that another vcpu is currently trying
      * a similar operation, so back off.
      */
-    if ( !paging_flush_tlb(need_flush, vpmask) )
+    if ( !paging_flush_tlb(vcpu_bitmap) )
         return -ERESTART;
 
     output->rep_complete = input->rep_count;
@@ -760,6 +744,7 @@ static int hvcall_flush_ex(const union hypercall_input *input,
         uint64_t flags;
         struct hv_vpset set;
     } input_params;
+    unsigned long *vcpu_bitmap;
 
     /* These hypercalls should never use the fast-call convention. */
     if ( input->fast )
@@ -770,8 +755,9 @@ static int hvcall_flush_ex(const union hypercall_input *input,
                                   sizeof(input_params)) != HVMTRANS_okay )
         return -EINVAL;
 
-    if ( input_params.flags & HV_FLUSH_ALL_PROCESSORS )
-        vpmask_fill(vpmask);
+    if ( input_params.flags & HV_FLUSH_ALL_PROCESSORS ||
+         input_params.set.format == HV_GENERIC_SET_ALL )
+        vcpu_bitmap = NULL;
     else
     {
         union hypercall_vpset *vpset = &this_cpu(hypercall_vpset);
@@ -807,13 +793,15 @@ static int hvcall_flush_ex(const union hypercall_input *input,
         rc = hv_vpset_to_vpmask(set, vpmask);
         if ( rc )
             return rc;
+
+        vcpu_bitmap = vpmask->mask;
     }
 
     /*
      * A false return means that another vcpu is currently trying
      * a similar operation, so back off.
      */
-    if ( !paging_flush_tlb(need_flush, vpmask) )
+    if ( !paging_flush_tlb(vcpu_bitmap) )
         return -ERESTART;
 
     output->rep_complete = input->rep_count;
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 5b269ef8b3bb..de4b13565ab4 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -696,8 +696,13 @@ static void hap_update_cr3(struct vcpu *v, int do_locking, bool noflush)
     hvm_update_guest_cr3(v, noflush);
 }
 
-static bool flush_tlb(bool (*flush_vcpu)(void *ctxt, struct vcpu *v),
-                      void *ctxt)
+static bool flush_vcpu(const struct vcpu *v, const unsigned long *vcpu_bitmap)
+{
+    return !vcpu_bitmap || test_bit(v->vcpu_id, vcpu_bitmap);
+}
+
+/* Flush TLB of selected vCPUs.  NULL for all. */
+static bool flush_tlb(const unsigned long *vcpu_bitmap)
 {
     static DEFINE_PER_CPU(cpumask_t, flush_cpumask);
     cpumask_t *mask = &this_cpu(flush_cpumask);
@@ -712,7 +717,7 @@ static bool flush_tlb(bool (*flush_vcpu)(void *ctxt, struct vcpu *v),
     {
         unsigned int cpu;
 
-        if ( !flush_vcpu(ctxt, v) )
+        if ( !flush_vcpu(v, vcpu_bitmap) )
             continue;
 
         hvm_asid_flush_vcpu(v);
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index 8c1b041f7135..de09ef5cae58 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -3069,9 +3069,13 @@ static void sh_clean_dirty_bitmap(struct domain *d)
 }
 
 
-/* Fluhs TLB of selected vCPUs. */
-bool shadow_flush_tlb(bool (*flush_vcpu)(void *ctxt, struct vcpu *v),
-                      void *ctxt)
+static bool flush_vcpu(const struct vcpu *v, const unsigned long *vcpu_bitmap)
+{
+    return !vcpu_bitmap || test_bit(v->vcpu_id, vcpu_bitmap);
+}
+
+/* Flush TLB of selected vCPUs.  NULL for all. */
+bool shadow_flush_tlb(const unsigned long *vcpu_bitmap)
 {
     static DEFINE_PER_CPU(cpumask_t, flush_cpumask);
     cpumask_t *mask = &this_cpu(flush_cpumask);
@@ -3084,12 +3088,12 @@ bool shadow_flush_tlb(bool (*flush_vcpu)(void *ctxt, struct vcpu *v),
 
     /* Pause all other vcpus. */
     for_each_vcpu ( d, v )
-        if ( v != current && flush_vcpu(ctxt, v) )
+        if ( v != current && flush_vcpu(v, vcpu_bitmap) )
             vcpu_pause_nosync(v);
 
     /* Now that all VCPUs are signalled to deschedule, we wait... */
     for_each_vcpu ( d, v )
-        if ( v != current && flush_vcpu(ctxt, v) )
+        if ( v != current && flush_vcpu(v, vcpu_bitmap) )
             while ( !vcpu_runnable(v) && v->is_running )
                 cpu_relax();
 
@@ -3103,7 +3107,7 @@ bool shadow_flush_tlb(bool (*flush_vcpu)(void *ctxt, struct vcpu *v),
     {
         unsigned int cpu;
 
-        if ( !flush_vcpu(ctxt, v) )
+        if ( !flush_vcpu(v, vcpu_bitmap) )
             continue;
 
         paging_update_cr3(v, false);
@@ -3118,7 +3122,7 @@ bool shadow_flush_tlb(bool (*flush_vcpu)(void *ctxt, struct vcpu *v),
 
     /* Done. */
     for_each_vcpu ( d, v )
-        if ( v != current && flush_vcpu(ctxt, v) )
+        if ( v != current && flush_vcpu(v, vcpu_bitmap) )
             vcpu_unpause(v);
 
     return true;
diff --git a/xen/arch/x86/mm/shadow/private.h b/xen/arch/x86/mm/shadow/private.h
index 35efb1b984fb..e4db8d32546a 100644
--- a/xen/arch/x86/mm/shadow/private.h
+++ b/xen/arch/x86/mm/shadow/private.h
@@ -922,8 +922,7 @@ static inline int sh_check_page_has_no_refs(struct page_info *page)
 }
 
 /* Flush the TLB of the selected vCPUs. */
-bool shadow_flush_tlb(bool (*flush_vcpu)(void *ctxt, struct vcpu *v),
-                      void *ctxt);
+bool shadow_flush_tlb(const unsigned long *vcpu_bitmap);
 
 #endif /* _XEN_SHADOW_PRIVATE_H */
 
diff --git a/xen/include/asm-x86/paging.h b/xen/include/asm-x86/paging.h
index 996c2cd0383f..308f1115dde9 100644
--- a/xen/include/asm-x86/paging.h
+++ b/xen/include/asm-x86/paging.h
@@ -141,9 +141,7 @@ struct paging_mode {
     void          (*update_cr3            )(struct vcpu *v, int do_locking,
                                             bool noflush);
     void          (*update_paging_modes   )(struct vcpu *v);
-    bool          (*flush_tlb             )(bool (*flush_vcpu)(void *ctxt,
-                                                               struct vcpu *v),
-                                            void *ctxt);
+    bool          (*flush_tlb             )(const unsigned long *vcpu_bitmap);
 
     unsigned int guest_levels;
 
@@ -417,11 +415,10 @@ static always_inline unsigned int paging_max_paddr_bits(const struct domain *d)
     return bits;
 }
 
-static inline bool paging_flush_tlb(bool (*flush_vcpu)(void *ctxt,
-                                                       struct vcpu *v),
-                                    void *ctxt)
+/* Flush selected vCPUs TLBs.  NULL for all. */
+static inline bool paging_flush_tlb(const unsigned long *vcpu_bitmap)
 {
-    return paging_get_hostmode(current)->flush_tlb(flush_vcpu, ctxt);
+    return paging_get_hostmode(current)->flush_tlb(vcpu_bitmap);
 }
 
 #endif /* XEN_PAGING_H */
-- 
2.11.0


Re: [PATCH v2] x86/hvm: Remove callback from paging->flush_tlb() hook

Posted by Durrant, Paul 2 weeks, 3 days ago
On 18/11/2021 18:48, Andrew Cooper wrote:
> TLB flushing is a hotpath, and function pointer calls are
> expensive (especially under repoline) for what amounts to an identity
> transform on the data.  Just pass the vcpu_bitmap bitmap directly.
> 
> As we use NULL for all rather than none, introduce a flush_vcpu() helper to
> avoid the risk of logical errors from opencoding the expression.  This also
> means the viridian callers can avoid writing an all-ones bitmap for the
> flushing logic to consume.
> 
> No functional change.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Paul Durrant <paul@xen.org>