When a guest is allowed access to cache control operations, such tracking
avoids having to issue a system-wide cache flush, and instead flushes only
the pCPUs where the vCPU has been scheduled since the last flush.

Note that domain-wide flushes accumulate the dirty caches from all the
vCPUs, but clearing the vCPU masks would require pausing all vCPUs, which
seems overkill.  Instead leave the vCPU dirty masks as-is; worst case this
results in redundant flushes in later calls.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
---
 xen/arch/x86/domain.c             | 43 +++++++++++++++++++++++++++++++
 xen/arch/x86/hvm/hvm.c            |  2 +-
 xen/arch/x86/hvm/mtrr.c           |  2 +-
 xen/arch/x86/hvm/svm/svm.c        |  6 +++--
 xen/arch/x86/hvm/vmx/vmx.c        |  6 +++--
 xen/arch/x86/include/asm/domain.h |  9 +++++++
 xen/arch/x86/mm.c                 | 25 +++++++-----------
 xen/arch/x86/pv/emul-priv-op.c    |  8 ++----
 8 files changed, 73 insertions(+), 28 deletions(-)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index f197dad4c0cd..3d08b829d2db 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -579,6 +579,13 @@ int arch_vcpu_create(struct vcpu *v)
 
         if ( (rc = init_vcpu_msr_policy(v)) )
             goto fail;
+
+        if ( cache_flush_permitted(d) &&
+             !cond_zalloc_cpumask_var(&v->arch.dirty_cache) )
+        {
+            rc = -ENOMEM;
+            goto fail;
+        }
     }
     else if ( (rc = xstate_alloc_save_area(v)) != 0 )
         return rc;
@@ -614,6 +621,7 @@ int arch_vcpu_create(struct vcpu *v)
     vcpu_destroy_fpu(v);
     xfree(v->arch.msrs);
     v->arch.msrs = NULL;
+    FREE_CPUMASK_VAR(v->arch.dirty_cache);
 
     return rc;
 }
@@ -628,6 +636,8 @@ void arch_vcpu_destroy(struct vcpu *v)
     xfree(v->arch.msrs);
     v->arch.msrs = NULL;
 
+    FREE_CPUMASK_VAR(v->arch.dirty_cache);
+
     if ( is_hvm_vcpu(v) )
         hvm_vcpu_destroy(v);
     else
@@ -2018,6 +2028,9 @@ static void __context_switch(void)
         cpumask_set_cpu(cpu, nd->dirty_cpumask);
     write_atomic(&n->dirty_cpu, cpu);
 
+    if ( cache_flush_permitted(nd) )
+        __cpumask_set_cpu(cpu, n->arch.dirty_cache);
+
     if ( !is_idle_domain(nd) )
     {
         memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES);
@@ -2606,6 +2619,36 @@ unsigned int domain_max_paddr_bits(const struct domain *d)
     return bits;
 }
 
+void vcpu_flush_cache(struct vcpu *curr)
+{
+    ASSERT(curr == current);
+    ASSERT(cache_flush_permitted(curr->domain));
+
+    flush_mask(curr->arch.dirty_cache, FLUSH_CACHE);
+    cpumask_clear(curr->arch.dirty_cache);
+    __cpumask_set_cpu(smp_processor_id(), curr->arch.dirty_cache);
+}
+
+void domain_flush_cache(const struct domain *d)
+{
+    const struct vcpu *v;
+    cpumask_t *mask = this_cpu(scratch_cpumask);
+
+    ASSERT(cache_flush_permitted(d));
+
+    cpumask_clear(mask);
+    for_each_vcpu( d, v )
+        cpumask_or(mask, mask, v->arch.dirty_cache);
+
+    flush_mask(mask, FLUSH_CACHE);
+    /*
+     * Clearing the mask of vCPUs in the domain would be racy unless all vCPUs
+     * are paused, so just leave them as-is, at the cost of possibly doing
+     * redundant flushes in later calls.  It's still better than doing a
+     * host-wide cache flush.
+     */
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 4cb2e13046d1..aed582a215a0 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2277,7 +2277,7 @@ void hvm_shadow_handle_cd(struct vcpu *v, unsigned long value)
             domain_pause_nosync(v->domain);
 
             /* Flush physical caches. */
-            flush_all(FLUSH_CACHE);
+            domain_flush_cache(v->domain);
             hvm_set_uc_mode(v, 1);
 
             domain_unpause(v->domain);
diff --git a/xen/arch/x86/hvm/mtrr.c b/xen/arch/x86/hvm/mtrr.c
index 887994d2b984..cfe0d44459c2 100644
--- a/xen/arch/x86/hvm/mtrr.c
+++ b/xen/arch/x86/hvm/mtrr.c
@@ -769,7 +769,7 @@ void memory_type_changed(struct domain *d)
     if ( cache_flush_permitted(d) &&
          d->vcpu && d->vcpu[0] && p2m_memory_type_changed(d) )
     {
-        flush_all(FLUSH_CACHE);
+        domain_flush_cache(d);
     }
 }
 
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index e33a38c1e446..5d1777ace335 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -2315,8 +2315,10 @@ static void svm_vmexit_mce_intercept(
 
 static void cf_check svm_wbinvd_intercept(void)
 {
-    if ( cache_flush_permitted(current->domain) )
-        flush_all(FLUSH_CACHE);
+    struct vcpu *curr = current;
+
+    if ( cache_flush_permitted(curr->domain) )
+        vcpu_flush_cache(curr);
 }
 
 static void svm_vmexit_do_invalidate_cache(struct cpu_user_regs *regs,
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 639882ceb216..9273607d576c 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3840,11 +3840,13 @@ static void vmx_do_extint(struct cpu_user_regs *regs)
 
 static void cf_check vmx_wbinvd_intercept(void)
 {
-    if ( !cache_flush_permitted(current->domain) )
+    struct vcpu *curr = current;
+
+    if ( !cache_flush_permitted(curr->domain) )
         return;
 
     if ( cpu_has_wbinvd_exiting )
-        flush_all(FLUSH_CACHE);
+        vcpu_flush_cache(curr);
     else
         wbinvd();
 }
diff --git a/xen/arch/x86/include/asm/domain.h b/xen/arch/x86/include/asm/domain.h
index 8c0dea12a526..064b51889dc2 100644
--- a/xen/arch/x86/include/asm/domain.h
+++ b/xen/arch/x86/include/asm/domain.h
@@ -668,6 +668,12 @@ struct arch_vcpu
 
     struct vcpu_msrs *msrs;
 
+    /*
+     * When vCPU is allowed cache control track the pCPUs the vCPU has run on
+     * since the last flush.
+     */
+    cpumask_var_t dirty_cache;
+
     struct {
         bool next_interrupt_enabled;
     } monitor;
@@ -790,6 +796,9 @@ unsigned int domain_max_paddr_bits(const struct domain *d);
 #define arch_init_idle_domain arch_init_idle_domain
 void arch_init_idle_domain(struct domain *d);
 
+void vcpu_flush_cache(struct vcpu *curr);
+void domain_flush_cache(const struct domain *d);
+
 #endif /* __ASM_DOMAIN_H__ */
 
 /*
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 59b60b1e62a7..11b59398a2c4 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -3804,26 +3804,19 @@ long do_mmuext_op(
             break;
 
         case MMUEXT_FLUSH_CACHE:
-            /*
-             * Dirty pCPU caches where the current vCPU has been scheduled are
-             * not tracked, and hence we need to resort to a global cache
-             * flush for correctness.
-             */
+            if ( unlikely(currd != pg_owner) )
+                rc = -EPERM;
+            else if ( likely(cache_flush_permitted(currd)) )
+                vcpu_flush_cache(curr);
+            else
+                rc = -EINVAL;
+            break;
+
         case MMUEXT_FLUSH_CACHE_GLOBAL:
             if ( unlikely(currd != pg_owner) )
                 rc = -EPERM;
             else if ( likely(cache_flush_permitted(currd)) )
-            {
-                unsigned int cpu;
-                cpumask_t *mask = this_cpu(scratch_cpumask);
-
-                cpumask_clear(mask);
-                for_each_online_cpu(cpu)
-                    if ( !cpumask_intersects(mask,
-                                             per_cpu(cpu_sibling_mask, cpu)) )
-                        __cpumask_set_cpu(cpu, mask);
-                flush_mask(mask, FLUSH_CACHE);
-            }
+                domain_flush_cache(currd);
             else
                 rc = -EINVAL;
             break;
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index 089d4cb4d905..076ce8f00457 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -1199,12 +1199,8 @@ static int cf_check cache_op(
      * newer linux uses this in some start-of-day timing loops.
      */
     if ( cache_flush_permitted(current->domain) )
-        /*
-         * Handle wbnoinvd as wbinvd, at the expense of higher cost.  Broadcast
-         * the flush to all pCPUs, Xen doesn't track where the vCPU has ran
-         * previously.
-         */
-        flush_all(FLUSH_CACHE);
+        /* Handle wbnoinvd as wbinvd, at the expense of higher cost. */
+        vcpu_flush_cache(current);
 
     return X86EMUL_OKAY;
 }
-- 
2.48.1
On 06.05.2025 10:31, Roger Pau Monne wrote:
> @@ -2606,6 +2619,36 @@ unsigned int domain_max_paddr_bits(const struct domain *d)
>      return bits;
>  }
>  
> +void vcpu_flush_cache(struct vcpu *curr)
> +{
> +    ASSERT(curr == current);
> +    ASSERT(cache_flush_permitted(curr->domain));
> +
> +    flush_mask(curr->arch.dirty_cache, FLUSH_CACHE);
> +    cpumask_clear(curr->arch.dirty_cache);
While here, re-ordering of the two operations would be merely for (kind of)
doc purposes, ...
> +    __cpumask_set_cpu(smp_processor_id(), curr->arch.dirty_cache);
> +}
> +
> +void domain_flush_cache(const struct domain *d)
> +{
> +    const struct vcpu *v;
> +    cpumask_t *mask = this_cpu(scratch_cpumask);
> +
> +    ASSERT(cache_flush_permitted(d));
> +
> +    cpumask_clear(mask);
> +    for_each_vcpu( d, v )
> +        cpumask_or(mask, mask, v->arch.dirty_cache);
> +
> +    flush_mask(mask, FLUSH_CACHE);
> +    /*
> +     * Clearing the mask of vCPUs in the domain would be racy unless all vCPUs
> +     * are paused, so just leave them as-is, at the cost of possibly doing
> +     * redundant flushes in later calls.  It's still better than doing a
> +     * host-wide cache flush.
> +     */
... wouldn't clearing before flushing avoid the raciness here?
> +}
Also imo both functions are a better fit in mm.c.
Jan
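For illustration only, a sketch of the "clearing before flushing" ordering the
question above refers to, applied to domain_flush_cache() from the patch (note
v can no longer be const).  This is not a tested change, and whether it is
actually free of races against __context_switch() updating the per-vCPU masks
is precisely what is under discussion:

void domain_flush_cache(const struct domain *d)
{
    struct vcpu *v;
    cpumask_t *mask = this_cpu(scratch_cpumask);

    ASSERT(cache_flush_permitted(d));

    cpumask_clear(mask);
    for_each_vcpu( d, v )
    {
        /*
         * Accumulate, then clear, each vCPU mask before issuing the flush,
         * so that pCPU bits set after the flush below are not dropped.
         */
        cpumask_or(mask, mask, v->arch.dirty_cache);
        cpumask_clear(v->arch.dirty_cache);
    }

    flush_mask(mask, FLUSH_CACHE);
}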
                
On 06/05/2025 9:31 am, Roger Pau Monne wrote:
> When a guest is allowed access to cache control operations such tracking
> prevents having to issue a system-wide cache flush, and rather just flush
> the pCPUs where the vCPU has been scheduled since the last flush.
>
> Note that domain-wide flushes accumulate the dirty caches from all the
> vCPUs, but clearing the vCPU masks will require pausing all vCPUs, which
> seems overkill.  Instead leave the vCPU dirty masks as-is, worse case it
> will result in redundant flushes in further calls.
>
> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
I'm afraid this doesn't work.

Unlike TLBs, dirty cachelines can move sideways, e.g. by foreign or grant
mappings, but also naturally because of how cache coherency works.

We need to use the guarantees given to us by the architecture to simply
nop out cache flushes when safe to do so.

Everything else is either a shootdown (clflush/opt/clwb, and doesn't
even trap to Xen), or needs to be a global WB{NO,}INVD.  Partial WBINVDs
are of no value.

~Andrew
                
On Tue, May 06, 2025 at 12:16:00PM +0100, Andrew Cooper wrote:
> On 06/05/2025 9:31 am, Roger Pau Monne wrote:
> > When a guest is allowed access to cache control operations such tracking
> > prevents having to issue a system-wide cache flush, and rather just flush
> > the pCPUs where the vCPU has been scheduled since the last flush.
> >
> > Note that domain-wide flushes accumulate the dirty caches from all the
> > vCPUs, but clearing the vCPU masks will require pausing all vCPUs, which
> > seems overkill.  Instead leave the vCPU dirty masks as-is, worse case it
> > will result in redundant flushes in further calls.
> >
> > Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> 
> I'm afraid this doesn't work.
> 
> Unlike TLBs, dirty cacheline can move sideways, e.g. by foreign or grant
> mapping, but also naturally because of how cache coherency works.

Does such sideways movement also imply that a local WB{NO,}INVD on native
could be equally bogus?

According to the SDM, cache lines can indeed move between processor
caches, but the memory controller must always snoop such moves and
flush the data to memory:

"Here, the processor with the valid data may pass the data to the
other processors without actually writing it to system memory;
however, it is the responsibility of the memory controller to snoop
this operation and update memory."

So a cache line moving sideways will always be propagated to memory as
part of the move, and hence the data in the previous pCPU cache will
always hit memory.

grants/foreign maps are indeed complex, but sharing non-coherent memory
across domains seems like a recipe for disaster.  It's maybe mitigated
by doing host-wide cache flushes, but how does the mapping domain know
whether the source domain has possibly dirty caches that need flushing?
IMO it's the source that would need to first flush any cache contents,
and then share the memory.

FWIW (and not saying this is correct), KVM uses the same model of
tracking dirty caches; see the wbinvd_dirty_mask field in struct
kvm_vcpu_arch.

> We need to use the guarantees given to us by the architecture to simply
> nop out cache flushes when safe to do so.

We already do this when possible AFAICT.

> Everything else is either a shootdown (clflush/opt/clwb, and doesn't
> even trap to Xen), or needs to be a global WB{NO,}INVD.  Partial WBINVDs
> are of no value.

What about on Intel when there's no capability to trap WBINVD?  Xen is
currently flushing the cache of the previous pCPU in case the vCPU has
moved around, see vmx_do_resume().

Thanks, Roger.
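For reference, the KVM model mentioned above works roughly as sketched below.
This is a simplified paraphrase rather than the actual kernel code (the real
logic lives in arch/x86/kvm/x86.c), and the two helper names are illustrative
only; the wbinvd_dirty_mask field is the one cited above.

static void wbinvd_ipi(void *unused)
{
        wbinvd();
}

/* On vCPU load: record the pCPU as possibly holding guest-dirtied lines. */
static void record_dirty_pcpu(struct kvm_vcpu *vcpu, int cpu)
{
        cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
}

/* On a guest WBINVD exit: flush only the recorded pCPUs, then reset. */
static void flush_dirty_pcpus(struct kvm_vcpu *vcpu)
{
        int cpu = get_cpu();

        cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
        on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask, wbinvd_ipi, NULL, 1);
        put_cpu();
        cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
}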
                
On 06.05.2025 14:55, Roger Pau Monné wrote:
> On Tue, May 06, 2025 at 12:16:00PM +0100, Andrew Cooper wrote:
>> On 06/05/2025 9:31 am, Roger Pau Monne wrote:
>>> When a guest is allowed access to cache control operations such tracking
>>> prevents having to issue a system-wide cache flush, and rather just flush
>>> the pCPUs where the vCPU has been scheduled since the last flush.
>>>
>>> Note that domain-wide flushes accumulate the dirty caches from all the
>>> vCPUs, but clearing the vCPU masks will require pausing all vCPUs, which
>>> seems overkill.  Instead leave the vCPU dirty masks as-is, worse case it
>>> will result in redundant flushes in further calls.
>>>
>>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>>
>> I'm afraid this doesn't work.
>>
>> Unlike TLBs, dirty cacheline can move sideways, e.g. by foreign or grant
>> mapping, but also naturally because of how cache coherency works.
> 
> Does such sideway moving also imply that local WB{NO,}INVD on native
> could be equally bogus?
> 
> According to the SDM, cache lines can indeed move between processor
> caches, but the memory controller must always snoop such moves and
> flush the data to memory:
> 
> "Here, the processor with the valid data may pass the data to the
> other processors without actually writing it to system memory;
> however, it is the responsibility of the memory controller to snoop
> this operation and update memory."
> 
> So a cache line moving sideways will always be propagated to memory as
> part of the move, and hence the data in the previous pCPU cache will
> always hit memory.

But that's only one of the two aspects of a flush. The other is to ensure
the respective data isn't in any (covered) cache anymore. IOW dirty-ness
(as the title has it) isn't a criterion, unless of course you mean "dirty"
in a sense different from what it means in the cache coherency model.

Jan
                
On Mon, May 12, 2025 at 05:38:07PM +0200, Jan Beulich wrote:
> On 06.05.2025 14:55, Roger Pau Monné wrote:
> > On Tue, May 06, 2025 at 12:16:00PM +0100, Andrew Cooper wrote:
> >> On 06/05/2025 9:31 am, Roger Pau Monne wrote:
> >>> When a guest is allowed access to cache control operations such tracking
> >>> prevents having to issue a system-wide cache flush, and rather just flush
> >>> the pCPUs where the vCPU has been scheduled since the last flush.
> >>>
> >>> Note that domain-wide flushes accumulate the dirty caches from all the
> >>> vCPUs, but clearing the vCPU masks will require pausing all vCPUs, which
> >>> seems overkill.  Instead leave the vCPU dirty masks as-is, worse case it
> >>> will result in redundant flushes in further calls.
> >>>
> >>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
> >>
> >> I'm afraid this doesn't work.
> >>
> >> Unlike TLBs, dirty cacheline can move sideways, e.g. by foreign or grant
> >> mapping, but also naturally because of how cache coherency works.
> > 
> > Does such sideway moving also imply that local WB{NO,}INVD on native
> > could be equally bogus?
> > 
> > According to the SDM, cache lines can indeed move between processor
> > caches, but the memory controller must always snoop such moves and
> > flush the data to memory:
> > 
> > "Here, the processor with the valid data may pass the data to the
> > other processors without actually writing it to system memory;
> > however, it is the responsibility of the memory controller to snoop
> > this operation and update memory."
> > 
> > So a cache line moving sideways will always be propagated to memory as
> > part of the move, and hence the data in the previous pCPU cache will
> > always hit memory.
> 
> But that's only one of the two aspects of a flush. The other is to ensure
> respective data isn't in any (covered) cache anymore. IOW dirty-ness (as
> the title has it) isn't a criteria, unless of course you mean "dirty" in
> a sense different from what it means in the cache coherency model.

Given the direct map, and the fact that the CPU can speculatively load
entries in the cache, I'm not sure there's much Xen can effectively do
ATM to ensure a certain cache line is not in any cache anymore.

It would possibly get better if/when the direct map is removed, but
even then Xen (or dom0) might map guest memory for emulation or IO
purposes.  Then Xen/dom0 would need to issue a wbinvd after unmapping
the memory, to ensure the cache is clean in case a vCPU from a domain
is scheduled there.

IMO, being realistic, it is very unlikely for Xen to be able to ensure
that after a guest wbinvd there are no guest-owned cache lines in any
CPU cache, even if such a wbinvd is propagated to all host CPUs.

Thanks, Roger.
                
On 15.05.2025 12:52, Roger Pau Monné wrote:
> On Mon, May 12, 2025 at 05:38:07PM +0200, Jan Beulich wrote:
>> On 06.05.2025 14:55, Roger Pau Monné wrote:
>>> On Tue, May 06, 2025 at 12:16:00PM +0100, Andrew Cooper wrote:
>>>> On 06/05/2025 9:31 am, Roger Pau Monne wrote:
>>>>> When a guest is allowed access to cache control operations such tracking
>>>>> prevents having to issue a system-wide cache flush, and rather just flush
>>>>> the pCPUs where the vCPU has been scheduled since the last flush.
>>>>>
>>>>> Note that domain-wide flushes accumulate the dirty caches from all the
>>>>> vCPUs, but clearing the vCPU masks will require pausing all vCPUs, which
>>>>> seems overkill.  Instead leave the vCPU dirty masks as-is, worse case it
>>>>> will result in redundant flushes in further calls.
>>>>>
>>>>> Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
>>>>
>>>> I'm afraid this doesn't work.
>>>>
>>>> Unlike TLBs, dirty cacheline can move sideways, e.g. by foreign or grant
>>>> mapping, but also naturally because of how cache coherency works.
>>>
>>> Does such sideway moving also imply that local WB{NO,}INVD on native
>>> could be equally bogus?
>>>
>>> According to the SDM, cache lines can indeed move between processor
>>> caches, but the memory controller must always snoop such moves and
>>> flush the data to memory:
>>>
>>> "Here, the processor with the valid data may pass the data to the
>>> other processors without actually writing it to system memory;
>>> however, it is the responsibility of the memory controller to snoop
>>> this operation and update memory."
>>>
>>> So a cache line moving sideways will always be propagated to memory as
>>> part of the move, and hence the data in the previous pCPU cache will
>>> always hit memory.
>>
>> But that's only one of the two aspects of a flush. The other is to ensure
>> respective data isn't in any (covered) cache anymore. IOW dirty-ness (as
>> the title has it) isn't a criteria, unless of course you mean "dirty" in
>> a sense different from what it means in the cache coherency model.
> 
> Given the direct map, and the fact that the CPU can speculatively load
> entries in the cache, I'm not sure there's much Xen can effectively do
> ATM to ensure a certain cache line it's not in any cache anymore.
> 
> It would possibly get better if/when the direct map is removed, but
> even then Xen (or dom0) might map guest memory for emulation or IO
> purposes.  Then Xen/dom0 would need to issue a wbinvd after unmapping
> the memory, to ensure the cache is clean in case a vCPU from a domain
> is scheduled there.
> 
> IMO being realistic it is very unlikely for Xen to be able to ensure
> that after a guest wbinvd there are no guest owned cache lines in any
> CPU cache, even if such wbinvd is propagated to all host CPUs.

Well, see Andrew's reply on one of the two "restrict guest-induced WBINVD"
patches of mine. What you say is effectively supporting the statement I
make in the descriptions there ("We allow its use for writeback purposes
only anyway, ..."). Which Andrew says is wrong for at least future use
cases, possibly even today's. IOW I think we first need to find some
common ground on the underlying goals / principles.

Jan
                