[Xen-devel] [PATCH v2 0/9] XSA-292 follow-up

Jan Beulich posted 9 patches 4 years, 7 months ago
Only 0 patches received!
[Xen-devel] [PATCH v2 0/9] XSA-292 follow-up
Posted by Jan Beulich 4 years, 7 months ago
Various CR3 and PCID related adjustments, first and foremost an
almost full re-write of switch_cr3_cr4() (in patch 2).

1: x86: adjust cr3_pcid() return type
2: x86: limit the amount of TLB flushing in switch_cr3_cr4()
3: x86/mm: honor opt_pcid also for 32-bit PV domains
4: x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
5: x86/HVM: refuse CR3 loads with reserved (upper) bits set
6: x86/HVM: relax shadow mode check in hvm_set_cr3()
7: x86/HVM: cosmetics to hvm_set_cr3()
8: x86/CPUID: drop INVPCID dependency on PCID
9: x86: PCID is unused when !PV

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 1/9] x86: adjust cr3_pcid() return type
Posted by Jan Beulich 4 years, 7 months ago
There's no need for it to be 64 bits wide - only the low twelve bits
of CR3 hold the PCID.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -103,7 +103,8 @@ static void do_tlb_flush(void)
 
 void switch_cr3_cr4(unsigned long cr3, unsigned long cr4)
 {
-    unsigned long flags, old_cr4, old_pcid;
+    unsigned long flags, old_cr4;
+    unsigned int old_pcid;
     u32 t;
 
     /* This non-reentrant function is sometimes called in interrupt context. */
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -287,7 +287,7 @@ static inline unsigned long cr3_pa(unsig
     return cr3 & X86_CR3_ADDR_MASK;
 }
 
-static inline unsigned long cr3_pcid(unsigned long cr3)
+static inline unsigned int cr3_pcid(unsigned long cr3)
 {
     return cr3 & X86_CR3_PCID_MASK;
 }


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 1/9] x86: adjust cr3_pcid() return type
Posted by Andrew Cooper 4 years, 7 months ago
On 17/09/2019 07:13, Jan Beulich wrote:
> There's no need for it to be 64 bits wide - only the low twelve bits
> of CR3 hold the PCID.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 2/9] x86: limit the amount of TLB flushing in switch_cr3_cr4()
Posted by Jan Beulich 4 years, 7 months ago
We really need to flush the TLB just once, if we do so with or after the
CR3 write. The only case where two flushes are unavoidable is when we
mean to turn off CR4.PGE (perhaps just temporarily; see the code
comment).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -104,82 +104,65 @@ static void do_tlb_flush(void)
 void switch_cr3_cr4(unsigned long cr3, unsigned long cr4)
 {
     unsigned long flags, old_cr4;
-    unsigned int old_pcid;
     u32 t;
 
+    /* Throughout this function we make this assumption: */
+    ASSERT(!(cr4 & X86_CR4_PCIDE) || !(cr4 & X86_CR4_PGE));
+
     /* This non-reentrant function is sometimes called in interrupt context. */
     local_irq_save(flags);
 
     t = pre_flush();
 
     old_cr4 = read_cr4();
-    if ( old_cr4 & X86_CR4_PGE )
+    ASSERT(!(old_cr4 & X86_CR4_PCIDE) || !(old_cr4 & X86_CR4_PGE));
+
+    /*
+     * We need to write CR4 before CR3 if we're about to enable PCIDE, at the
+     * very least when the new PCID is non-zero.
+     *
+     * As we also need to do two CR4 writes in total when PGE is enabled and
+     * is to remain enabled, do the one temporarily turning off the bit right
+     * here as well.
+     *
+     * The only TLB flushing effect we depend on here is in case we move from
+     * PGE set to PCIDE set, where we want global page entries gone (and none
+     * to re-appear) after this write.
+     */
+    if ( !(old_cr4 & X86_CR4_PCIDE) &&
+         ((cr4 & X86_CR4_PCIDE) || (cr4 & old_cr4 & X86_CR4_PGE)) )
     {
-        /*
-         * X86_CR4_PGE set means PCID is inactive.
-         * We have to purge the TLB via flipping cr4.pge.
-         */
         old_cr4 = cr4 & ~X86_CR4_PGE;
         write_cr4(old_cr4);
     }
-    else if ( use_invpcid )
-    {
-        /*
-         * Flushing the TLB via INVPCID is necessary only in case PCIDs are
-         * in use, which is true only with INVPCID being available.
-         * Without PCID usage the following write_cr3() will purge the TLB
-         * (we are in the cr4.pge off path) of all entries.
-         * Using invpcid_flush_all_nonglobals() seems to be faster than
-         * invpcid_flush_all(), so use that.
-         */
-        invpcid_flush_all_nonglobals();
-
-        /*
-         * CR4.PCIDE needs to be set before the CR3 write below. Otherwise
-         * - the CR3 write will fault when CR3.NOFLUSH is set (which is the
-         *   case normally),
-         * - the subsequent CR4 write will fault if CR3.PCID != 0.
-         */
-        if ( (old_cr4 & X86_CR4_PCIDE) < (cr4 & X86_CR4_PCIDE) )
-        {
-            write_cr4(cr4);
-            old_cr4 = cr4;
-        }
-    }
 
     /*
-     * If we don't change PCIDs, the CR3 write below needs to flush this very
-     * PCID, even when a full flush was performed above, as we are currently
-     * accumulating TLB entries again from the old address space.
-     * NB: Clearing the bit when we don't use PCID is benign (as it is clear
-     * already in that case), but allows the if() to be more simple.
+     * If the CR4 write is to turn off PCIDE, we don't need the CR3 write to
+     * flush anything, as that transition is a full flush itself.
      */
-    old_pcid = cr3_pcid(read_cr3());
-    if ( old_pcid == cr3_pcid(cr3) )
-        cr3 &= ~X86_CR3_NOFLUSH;
-
+    if ( (old_cr4 & X86_CR4_PCIDE) > (cr4 & X86_CR4_PCIDE) )
+        cr3 |= X86_CR3_NOFLUSH;
     write_cr3(cr3);
 
     if ( old_cr4 != cr4 )
         write_cr4(cr4);
 
     /*
-     * Make sure no TLB entries related to the old PCID created between
-     * flushing the TLB and writing the new %cr3 value remain in the TLB.
-     *
-     * The write to CR4 just above has performed a wider flush in certain
-     * cases, which therefore get excluded here. Since that write is
-     * conditional, note in particular that it won't be skipped if PCIDE
-     * transitions from 1 to 0. This is because the CR4 write further up will
-     * have been skipped in this case, as PCIDE and PGE won't both be set at
-     * the same time.
-     *
-     * Note also that PGE is always clear in old_cr4.
+     *  PGE  | PCIDE | flush at
+     * ------+-------+------------------------
+     *  0->0 | 0->0  | CR3 write
+     *  0->0 | 0->1  | n/a (see 1st CR4 write)
+     *  0->x | 1->0  | CR4 write
+     *  x->1 | x->1  | n/a
+     *  0->0 | 1->1  | INVPCID
+     *  0->1 | 0->0  | CR3 and CR4 writes
+     *  1->0 | 0->0  | CR4 write
+     *  1->0 | 0->1  | n/a (see 1st CR4 write)
+     *  1->1 | 0->0  | n/a (see 1st CR4 write)
+     *  1->x | 1->x  | n/a
      */
-    if ( old_pcid != cr3_pcid(cr3) &&
-         !(cr4 & X86_CR4_PGE) &&
-         (old_cr4 & X86_CR4_PCIDE) <= (cr4 & X86_CR4_PCIDE) )
-        invpcid_flush_single_context(old_pcid);
+    if ( cr4 & X86_CR4_PCIDE )
+        invpcid_flush_all_nonglobals();
 
     post_flush(t);
 


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 2/9] x86: limit the amount of TLB flushing in switch_cr3_cr4()
Posted by Andrew Cooper 4 years, 7 months ago
On 17/09/2019 07:13, Jan Beulich wrote:
> We really need to flush the TLB just once, if we do so with or after the
> CR3 write. The only case where two flushes are unavoidable is when we
> mean to turn off CR4.PGE (perhaps just temporarily; see the code
> comment).
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

I'm a bit hesitant, considering the two especially subtle TLB flushing
bugs we introduced with the last change to this function, but that alone
isn't a legitimate reason to block the patch.

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 3/9] x86/mm: honor opt_pcid also for 32-bit PV domains
Posted by Jan Beulich 4 years, 7 months ago
I can't see any technical or performance reason why we should treat
32-bit PV different from 64-bit PV in this regard.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -180,7 +180,24 @@ int switch_compat(struct domain *d)
     d->arch.x87_fip_width = 4;
 
     d->arch.pv.xpti = false;
-    d->arch.pv.pcid = false;
+
+    if ( use_invpcid && cpu_has_pcid )
+        switch ( ACCESS_ONCE(opt_pcid) )
+        {
+        case PCID_OFF:
+        case PCID_XPTI:
+            d->arch.pv.pcid = false;
+            break;
+
+        case PCID_ALL:
+        case PCID_NOXPTI:
+            d->arch.pv.pcid = true;
+            break;
+
+        default:
+            ASSERT_UNREACHABLE();
+            break;
+        }
 
     return 0;
 
@@ -312,7 +329,7 @@ int pv_domain_initialise(struct domain *
 
     d->arch.pv.xpti = is_hardware_domain(d) ? opt_xpti_hwdom : opt_xpti_domu;
 
-    if ( !is_pv_32bit_domain(d) && use_invpcid && cpu_has_pcid )
+    if ( use_invpcid && cpu_has_pcid )
         switch ( ACCESS_ONCE(opt_pcid) )
         {
         case PCID_OFF:


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 3/9] x86/mm: honor opt_pcid also for 32-bit PV domains
Posted by Andrew Cooper 4 years, 7 months ago
On 17/09/2019 07:14, Jan Beulich wrote:
> I can't see any technical or performance reason why we should treat
> 32-bit PV different from 64-bit PV in this regard.

Well, other than the fact this setting is only read for a 64bit guest...

The reason it isn't set for 32bit guests is that there is no scenario
where we use it.

For PV guests in general, we could consider making a similar
optimisation to Linux and having a LRU list of 4 or so PCIDs to speed up
process context switches, which would be applicable to 32bit guests as
well (so long as someone can reason about the TLB flushing behaviour of
the PV ABI), but this sounds like a large quantity of work.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 3/9] x86/mm: honor opt_pcid also for 32-bit PV domains
Posted by Jan Beulich 4 years, 7 months ago
On 17.09.2019 21:00, Andrew Cooper wrote:
> On 17/09/2019 07:14, Jan Beulich wrote:
>> I can't see any technical or performance reason why we should treat
>> 32-bit PV different from 64-bit PV in this regard.
> 
> Well, other than the fact this setting is only read for a 64bit guest...

How come? make_cr3() uses it uniformly, as does pv_make_cr4().
toggle_guest_mode() is the one case where it's strictly 64-bit
guest only.

> The reason it isn't set for 32bit guests is that there is no scenario
> where we use it.

"pcid=1" and "pcid=noxpti" both are scenarios where, with this
patch in place, we would use it.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 3/9] x86/mm: honor opt_pcid also for 32-bit PV domains
Posted by Andrew Cooper 4 years, 7 months ago
On 18/09/2019 10:22, Jan Beulich wrote:
> On 17.09.2019 21:00, Andrew Cooper wrote:
>> On 17/09/2019 07:14, Jan Beulich wrote:
>>> I can't see any technical or performance reason why we should treat
>>> 32-bit PV different from 64-bit PV in this regard.
>> Well, other than the fact this setting is only read for a 64bit guest...
> How come? make_cr3() uses it uniformly, as does pv_make_cr4().
> toggle_guest_mode() is the one case where it's strictly 64-bit
> guest only.

Oh - you are right.  I don't know how I came to the above conclusion,
but ...

>> The reason it isn't set for 32bit guests is that there is no scenario
>> where we use it.
> "pcid=1" and "pcid=noxpti" both are scenarios where, with this
> patch in place, we would use it.

... I still don't see why this is sensible.

As far as I can tell, all it will do for a 32bit PV guest is start using
2 PCIDs for the same logical gCR3, which will be a net perf it hit for
32bit PV guests.

This is ultimately wrapped up in the confusion over TF_kernel_mode and
v->arch.guest_table{,_user}.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 3/9] x86/mm: honor opt_pcid also for 32-bit PV domains
Posted by Jan Beulich 4 years, 7 months ago
On 18.09.2019 13:55, Andrew Cooper wrote:
> On 18/09/2019 10:22, Jan Beulich wrote:
>> On 17.09.2019 21:00, Andrew Cooper wrote:
>>> On 17/09/2019 07:14, Jan Beulich wrote:
>>>> I can't see any technical or performance reason why we should treat
>>>> 32-bit PV different from 64-bit PV in this regard.
>>> Well, other than the fact this setting is only read for a 64bit guest...
>> How come? make_cr3() uses it uniformly, as does pv_make_cr4().
>> toggle_guest_mode() is the one case where it's strictly 64-bit
>> guest only.
> 
> Oh - you are right.  I don't know how I came to the above conclusion,
> but ...
> 
>>> The reason it isn't set for 32bit guests is that there is no scenario
>>> where we use it.
>> "pcid=1" and "pcid=noxpti" both are scenarios where, with this
>> patch in place, we would use it.
> 
> ... I still don't see why this is sensible.

Whether it's sensible to at least try out I'm not going to judge.
What I find wrong though is that, for no reason, we don't fully
honor the command line option prior to this change.

> As far as I can tell, all it will do for a 32bit PV guest is start using
> 2 PCIDs for the same logical gCR3, which will be a net perf it hit for
> 32bit PV guests.
> 
> This is ultimately wrapped up in the confusion over TF_kernel_mode and
> v->arch.guest_table{,_user}.

Is there still confusion, despite the cleanup done not overly long
ago? TF_kernel_mode is now uniformly set for HVM and 32-bit PV
vCPU-s; only 64-bit PV vCPU-s can have it clear.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
Posted by Jan Beulich 4 years, 7 months ago
The bit is meaningful only for MOV-to-CR3 insns, not anywhere else, in
particular not when loading nested guest state.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -2080,6 +2080,8 @@ static int hvmemul_write_cr(
     HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
     switch ( reg )
     {
+        bool noflush;
+
     case 0:
         rc = hvm_set_cr0(val, true);
         break;
@@ -2090,7 +2092,10 @@ static int hvmemul_write_cr(
         break;
 
     case 3:
-        rc = hvm_set_cr3(val, true);
+        noflush = hvm_pcid_enabled(current) && (val & X86_CR3_NOFLUSH);
+        if ( noflush )
+            val &= ~X86_CR3_NOFLUSH;
+        rc = hvm_set_cr3(val, noflush, true);
         break;
 
     case 4:
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2059,12 +2059,17 @@ int hvm_mov_to_cr(unsigned int cr, unsig
 
     switch ( cr )
     {
+        bool noflush;
+
     case 0:
         rc = hvm_set_cr0(val, true);
         break;
 
     case 3:
-        rc = hvm_set_cr3(val, true);
+        noflush = hvm_pcid_enabled(curr) && (val & X86_CR3_NOFLUSH);
+        if ( noflush )
+            val &= ~X86_CR3_NOFLUSH;
+        rc = hvm_set_cr3(val, noflush, true);
         break;
 
     case 4:
@@ -2282,12 +2287,11 @@ int hvm_set_cr0(unsigned long value, boo
     return X86EMUL_OKAY;
 }
 
-int hvm_set_cr3(unsigned long value, bool may_defer)
+int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
 {
     struct vcpu *v = current;
     struct page_info *page;
     unsigned long old = v->arch.hvm.guest_cr[3];
-    bool noflush = false;
 
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
@@ -2299,17 +2303,12 @@ int hvm_set_cr3(unsigned long value, boo
             /* The actual write will occur in hvm_do_resume(), if permitted. */
             v->arch.vm_event->write_data.do_write.cr3 = 1;
             v->arch.vm_event->write_data.cr3 = value;
+            v->arch.vm_event->write_data.cr3_noflush = noflush;
 
             return X86EMUL_OKAY;
         }
     }
 
-    if ( hvm_pcid_enabled(v) ) /* Clear the noflush bit. */
-    {
-        noflush = value & X86_CR3_NOFLUSH;
-        value &= ~X86_CR3_NOFLUSH;
-    }
-
     if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
          (value != v->arch.hvm.guest_cr[3]) )
     {
@@ -3004,7 +3003,7 @@ void hvm_task_switch(
     if ( task_switch_load_seg(x86_seg_ldtr, tss.ldt, new_cpl, 0) )
         goto out;
 
-    rc = hvm_set_cr3(tss.cr3, true);
+    rc = hvm_set_cr3(tss.cr3, false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
     if ( rc != X86EMUL_OKAY )
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -324,7 +324,7 @@ static int nsvm_vcpu_hostrestore(struct
         v->arch.guest_table = pagetable_null();
         /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
     }
-    rc = hvm_set_cr3(n1vmcb->_cr3, true);
+    rc = hvm_set_cr3(n1vmcb->_cr3, false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
     if (rc != X86EMUL_OKAY)
@@ -584,7 +584,7 @@ static int nsvm_vmcb_prepare4vmrun(struc
         nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
 
         /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
-        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
+        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
         if ( rc == X86EMUL_EXCEPTION )
             hvm_inject_hw_exception(TRAP_gp_fault, 0);
         if (rc != X86EMUL_OKAY)
@@ -598,7 +598,7 @@ static int nsvm_vmcb_prepare4vmrun(struc
          * we assume it intercepts page faults.
          */
         /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
-        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
+        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
         if ( rc == X86EMUL_EXCEPTION )
             hvm_inject_hw_exception(TRAP_gp_fault, 0);
         if (rc != X86EMUL_OKAY)
--- a/xen/arch/x86/hvm/vm_event.c
+++ b/xen/arch/x86/hvm/vm_event.c
@@ -110,7 +110,7 @@ void hvm_vm_event_do_resume(struct vcpu
 
     if ( unlikely(w->do_write.cr3) )
     {
-        if ( hvm_set_cr3(w->cr3, false) == X86EMUL_EXCEPTION )
+        if ( hvm_set_cr3(w->cr3, w->cr3_noflush, false) == X86EMUL_EXCEPTION )
             hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
         w->do_write.cr3 = 0;
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1032,7 +1032,7 @@ static void load_shadow_guest_state(stru
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
-    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), true);
+    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
@@ -1246,7 +1246,7 @@ static void load_vvmcs_host_state(struct
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
-    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), true);
+    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -274,6 +274,8 @@ struct monitor_write_data {
         unsigned int cr4 : 1;
     } do_write;
 
+    bool cr3_noflush;
+
     uint32_t msr;
     uint64_t value;
     uint64_t cr0;
--- a/xen/include/asm-x86/hvm/support.h
+++ b/xen/include/asm-x86/hvm/support.h
@@ -135,7 +135,7 @@ void hvm_shadow_handle_cd(struct vcpu *v
  */
 int hvm_set_efer(uint64_t value);
 int hvm_set_cr0(unsigned long value, bool may_defer);
-int hvm_set_cr3(unsigned long value, bool may_defer);
+int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer);
 int hvm_set_cr4(unsigned long value, bool may_defer);
 int hvm_descriptor_access_intercept(uint64_t exit_info,
                                     uint64_t vmx_exit_qualification,


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
Posted by Andrew Cooper 4 years, 7 months ago
On 17/09/2019 07:15, Jan Beulich wrote:
> The bit is meaningful only for MOV-to-CR3 insns, not anywhere else, in
> particular not when loading nested guest state.

I've found a footnote for "26.3.1.1 Checks on Guest Control Registers,
Debug Registers, and MSRs" stating:

"Bit 63 of the CR3 field in the guest-state area must be 0. This is true
even though, If CR4.PCIDE = 1, bit 63 of the source operand to MOV to
CR3 is used to determine whether cached translation information is
invalidated."

This is fairly clear that attempting to set the NOFLUSH bit in the VMCS
is going to cause problems, and I seem to recall some vmentry failures
reported in the past on the subject.

What it isn't so clear on is whether we will ever find a VMCS with the
NOFLUSH bit set in guest state.  Sadly, the vmexit side of the above
comment (which was for vmentry) simply says "The contents of ... CR3,
... are saved into the corresponding fields."

Any way, I'm in principle ok with the change, except for...

> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -2282,12 +2287,11 @@ int hvm_set_cr0(unsigned long value, boo
>      return X86EMUL_OKAY;
>  }
>  
> -int hvm_set_cr3(unsigned long value, bool may_defer)
> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
>  {
>      struct vcpu *v = current;
>      struct page_info *page;
>      unsigned long old = v->arch.hvm.guest_cr[3];
> -    bool noflush = false;
>  
>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
> @@ -2299,17 +2303,12 @@ int hvm_set_cr3(unsigned long value, boo
>              /* The actual write will occur in hvm_do_resume(), if permitted. */
>              v->arch.vm_event->write_data.do_write.cr3 = 1;
>              v->arch.vm_event->write_data.cr3 = value;
> +            v->arch.vm_event->write_data.cr3_noflush = noflush;

... this, which I'm pretty sure breaks the reporting of noflush mov to
cr3's.  The semantics of the VMI hook is "the guest tried to write this
value to cr3", which should include the noflush bit in its architectural
position.  I suspect it also breaks a reply with the noflush bit set,
because I don't see any way that is fed back from the introspecting agent.

CC'ing the Introspection maintainers for their view.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
Posted by Jan Beulich 4 years, 7 months ago
On 17.09.2019 21:31, Andrew Cooper wrote:
> On 17/09/2019 07:15, Jan Beulich wrote:
>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -2282,12 +2287,11 @@ int hvm_set_cr0(unsigned long value, boo
>>      return X86EMUL_OKAY;
>>  }
>>  
>> -int hvm_set_cr3(unsigned long value, bool may_defer)
>> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
>>  {
>>      struct vcpu *v = current;
>>      struct page_info *page;
>>      unsigned long old = v->arch.hvm.guest_cr[3];
>> -    bool noflush = false;
>>  
>>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
>> @@ -2299,17 +2303,12 @@ int hvm_set_cr3(unsigned long value, boo
>>              /* The actual write will occur in hvm_do_resume(), if permitted. */
>>              v->arch.vm_event->write_data.do_write.cr3 = 1;
>>              v->arch.vm_event->write_data.cr3 = value;
>> +            v->arch.vm_event->write_data.cr3_noflush = noflush;
> 
> ... this, which I'm pretty sure breaks the reporting of noflush mov to
> cr3's.  The semantics of the VMI hook is "the guest tried to write this
> value to cr3", which should include the noflush bit in its architectural
> position.  I suspect it also breaks a reply with the noflush bit set,
> because I don't see any way that is fed back from the introspecting agent.

hvm_monitor_cr() zaps the bit off the reported value. I wonder
whether the patch here should include removing this zapping, as
being redundant now. I don't think though that the patch alters
behavior in this regard.

> CC'ing the Introspection maintainers for their view.

I'll wait for their feedback anyway, before making any possible
change to the patch.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
Posted by Tamas K Lengyel 4 years, 7 months ago
On Wed, Sep 18, 2019 at 3:29 AM Jan Beulich <jbeulich@suse.com> wrote:
>
> On 17.09.2019 21:31, Andrew Cooper wrote:
> > On 17/09/2019 07:15, Jan Beulich wrote:
> >> --- a/xen/arch/x86/hvm/hvm.c
> >> +++ b/xen/arch/x86/hvm/hvm.c
> >> @@ -2282,12 +2287,11 @@ int hvm_set_cr0(unsigned long value, boo
> >>      return X86EMUL_OKAY;
> >>  }
> >>
> >> -int hvm_set_cr3(unsigned long value, bool may_defer)
> >> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
> >>  {
> >>      struct vcpu *v = current;
> >>      struct page_info *page;
> >>      unsigned long old = v->arch.hvm.guest_cr[3];
> >> -    bool noflush = false;
> >>
> >>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
> >>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
> >> @@ -2299,17 +2303,12 @@ int hvm_set_cr3(unsigned long value, boo
> >>              /* The actual write will occur in hvm_do_resume(), if permitted. */
> >>              v->arch.vm_event->write_data.do_write.cr3 = 1;
> >>              v->arch.vm_event->write_data.cr3 = value;
> >> +            v->arch.vm_event->write_data.cr3_noflush = noflush;
> >
> > ... this, which I'm pretty sure breaks the reporting of noflush mov to
> > cr3's.  The semantics of the VMI hook is "the guest tried to write this
> > value to cr3", which should include the noflush bit in its architectural
> > position.  I suspect it also breaks a reply with the noflush bit set,
> > because I don't see any way that is fed back from the introspecting agent.
>
> hvm_monitor_cr() zaps the bit off the reported value. I wonder
> whether the patch here should include removing this zapping, as
> being redundant now. I don't think though that the patch alters
> behavior in this regard.
>
> > CC'ing the Introspection maintainers for their view.
>
> I'll wait for their feedback anyway, before making any possible
> change to the patch.
>

I'm not aware of any use-case of the CR3 introspection that needs to
know whether the noflush bit was set or not. Having it zapped before
the event is sent out simplifies handling of CR3 events because we
don't have to constantly reply with a different cr3 value from the
monitor agent with that bit masked. So unless that changes and we come
up with a usecase that needs it I wouldn't change how things are right
now.

Tamas

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
Posted by Paul Durrant 4 years, 7 months ago
> -----Original Message-----
> From: Xen-devel <xen-devel-bounces@lists.xenproject.org> On Behalf Of Jan Beulich
> Sent: 17 September 2019 07:15
> To: xen-devel@lists.xenproject.org
> Cc: Kevin Tian <kevin.tian@intel.com>; Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>; Wei Liu
> <wl@xen.org>; Paul Durrant <paul@xen.org>; George Dunlap <George.Dunlap@citrix.com>; Andrew Cooper
> <Andrew.Cooper3@citrix.com>; Jun Nakajima <jun.nakajima@intel.com>; Boris Ostrovsky
> <boris.ostrovsky@oracle.com>; Roger Pau Monne <roger.pau@citrix.com>
> Subject: [Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
> 
> The bit is meaningful only for MOV-to-CR3 insns, not anywhere else, in
> particular not when loading nested guest state.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -2080,6 +2080,8 @@ static int hvmemul_write_cr(
>      HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
>      switch ( reg )
>      {
> +        bool noflush;
> +

I said this before... I think the tighter case-scope is better. Cosmetically it may look a little odd, but surely it gives the compiler a better chance to optimize?

  Paul

>      case 0:
>          rc = hvm_set_cr0(val, true);
>          break;
> @@ -2090,7 +2092,10 @@ static int hvmemul_write_cr(
>          break;
> 
>      case 3:
> -        rc = hvm_set_cr3(val, true);
> +        noflush = hvm_pcid_enabled(current) && (val & X86_CR3_NOFLUSH);
> +        if ( noflush )
> +            val &= ~X86_CR3_NOFLUSH;
> +        rc = hvm_set_cr3(val, noflush, true);
>          break;
> 
>      case 4:
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -2059,12 +2059,17 @@ int hvm_mov_to_cr(unsigned int cr, unsig
> 
>      switch ( cr )
>      {
> +        bool noflush;
> +
>      case 0:
>          rc = hvm_set_cr0(val, true);
>          break;
> 
>      case 3:
> -        rc = hvm_set_cr3(val, true);
> +        noflush = hvm_pcid_enabled(curr) && (val & X86_CR3_NOFLUSH);
> +        if ( noflush )
> +            val &= ~X86_CR3_NOFLUSH;
> +        rc = hvm_set_cr3(val, noflush, true);
>          break;
> 
>      case 4:
> @@ -2282,12 +2287,11 @@ int hvm_set_cr0(unsigned long value, boo
>      return X86EMUL_OKAY;
>  }
> 
> -int hvm_set_cr3(unsigned long value, bool may_defer)
> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
>  {
>      struct vcpu *v = current;
>      struct page_info *page;
>      unsigned long old = v->arch.hvm.guest_cr[3];
> -    bool noflush = false;
> 
>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
> @@ -2299,17 +2303,12 @@ int hvm_set_cr3(unsigned long value, boo
>              /* The actual write will occur in hvm_do_resume(), if permitted. */
>              v->arch.vm_event->write_data.do_write.cr3 = 1;
>              v->arch.vm_event->write_data.cr3 = value;
> +            v->arch.vm_event->write_data.cr3_noflush = noflush;
> 
>              return X86EMUL_OKAY;
>          }
>      }
> 
> -    if ( hvm_pcid_enabled(v) ) /* Clear the noflush bit. */
> -    {
> -        noflush = value & X86_CR3_NOFLUSH;
> -        value &= ~X86_CR3_NOFLUSH;
> -    }
> -
>      if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
>           (value != v->arch.hvm.guest_cr[3]) )
>      {
> @@ -3004,7 +3003,7 @@ void hvm_task_switch(
>      if ( task_switch_load_seg(x86_seg_ldtr, tss.ldt, new_cpl, 0) )
>          goto out;
> 
> -    rc = hvm_set_cr3(tss.cr3, true);
> +    rc = hvm_set_cr3(tss.cr3, false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
>      if ( rc != X86EMUL_OKAY )
> --- a/xen/arch/x86/hvm/svm/nestedsvm.c
> +++ b/xen/arch/x86/hvm/svm/nestedsvm.c
> @@ -324,7 +324,7 @@ static int nsvm_vcpu_hostrestore(struct
>          v->arch.guest_table = pagetable_null();
>          /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
>      }
> -    rc = hvm_set_cr3(n1vmcb->_cr3, true);
> +    rc = hvm_set_cr3(n1vmcb->_cr3, false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
>      if (rc != X86EMUL_OKAY)
> @@ -584,7 +584,7 @@ static int nsvm_vmcb_prepare4vmrun(struc
>          nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
> 
>          /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
> -        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
> +        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
>          if ( rc == X86EMUL_EXCEPTION )
>              hvm_inject_hw_exception(TRAP_gp_fault, 0);
>          if (rc != X86EMUL_OKAY)
> @@ -598,7 +598,7 @@ static int nsvm_vmcb_prepare4vmrun(struc
>           * we assume it intercepts page faults.
>           */
>          /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
> -        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
> +        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
>          if ( rc == X86EMUL_EXCEPTION )
>              hvm_inject_hw_exception(TRAP_gp_fault, 0);
>          if (rc != X86EMUL_OKAY)
> --- a/xen/arch/x86/hvm/vm_event.c
> +++ b/xen/arch/x86/hvm/vm_event.c
> @@ -110,7 +110,7 @@ void hvm_vm_event_do_resume(struct vcpu
> 
>      if ( unlikely(w->do_write.cr3) )
>      {
> -        if ( hvm_set_cr3(w->cr3, false) == X86EMUL_EXCEPTION )
> +        if ( hvm_set_cr3(w->cr3, w->cr3_noflush, false) == X86EMUL_EXCEPTION )
>              hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
>          w->do_write.cr3 = 0;
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -1032,7 +1032,7 @@ static void load_shadow_guest_state(stru
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> -    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), true);
> +    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> @@ -1246,7 +1246,7 @@ static void load_vvmcs_host_state(struct
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> -    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), true);
> +    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -274,6 +274,8 @@ struct monitor_write_data {
>          unsigned int cr4 : 1;
>      } do_write;
> 
> +    bool cr3_noflush;
> +
>      uint32_t msr;
>      uint64_t value;
>      uint64_t cr0;
> --- a/xen/include/asm-x86/hvm/support.h
> +++ b/xen/include/asm-x86/hvm/support.h
> @@ -135,7 +135,7 @@ void hvm_shadow_handle_cd(struct vcpu *v
>   */
>  int hvm_set_efer(uint64_t value);
>  int hvm_set_cr0(unsigned long value, bool may_defer);
> -int hvm_set_cr3(unsigned long value, bool may_defer);
> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer);
>  int hvm_set_cr4(unsigned long value, bool may_defer);
>  int hvm_descriptor_access_intercept(uint64_t exit_info,
>                                      uint64_t vmx_exit_qualification,
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xenproject.org
> https://lists.xenproject.org/mailman/listinfo/xen-devel
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
Posted by Jan Beulich 4 years, 7 months ago
On 17.09.2019 14:45, Paul Durrant wrote:
>> From: Xen-devel <xen-devel-bounces@lists.xenproject.org> On Behalf Of Jan Beulich
>> Sent: 17 September 2019 07:15
>>
>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -2080,6 +2080,8 @@ static int hvmemul_write_cr(
>>      HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
>>      switch ( reg )
>>      {
>> +        bool noflush;
>> +
> 
> I said this before... I think the tighter case-scope is better.

I recall you saying so, but I don't recall you actually making this a
requirement to get your ack.

> Cosmetically it may look a little odd, but surely it gives the
> compiler a better chance to optimize?

I don't think so - they're pretty good to limit the life range of
variables (at least in not overly hairy functions) these days. The
more narrow scopes we often ask for are more for the reader to
easily know what the intended usage range of a variable is.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
Posted by Paul Durrant 4 years, 7 months ago
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: 17 September 2019 14:03
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: xen-devel@lists.xenproject.org; Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>; Andrew
> Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Roger Pau Monne
> <roger.pau@citrix.com>; Jun Nakajima <jun.nakajima@intel.com>; Kevin Tian <kevin.tian@intel.com>;
> BorisOstrovsky <boris.ostrovsky@oracle.com>; Paul Durrant <paul@xen.org>; Wei Liu <wl@xen.org>
> Subject: Re: [Xen-devel] [PATCH v2 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
> 
> On 17.09.2019 14:45, Paul Durrant wrote:
> >> From: Xen-devel <xen-devel-bounces@lists.xenproject.org> On Behalf Of Jan Beulich
> >> Sent: 17 September 2019 07:15
> >>
> >> --- a/xen/arch/x86/hvm/emulate.c
> >> +++ b/xen/arch/x86/hvm/emulate.c
> >> @@ -2080,6 +2080,8 @@ static int hvmemul_write_cr(
> >>      HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
> >>      switch ( reg )
> >>      {
> >> +        bool noflush;
> >> +
> >
> > I said this before... I think the tighter case-scope is better.
> 
> I recall you saying so, but I don't recall you actually making this a
> requirement to get your ack.
> 
> > Cosmetically it may look a little odd, but surely it gives the
> > compiler a better chance to optimize?
> 
> I don't think so - they're pretty good to limit the life range of
> variables (at least in not overly hairy functions) these days. The
> more narrow scopes we often ask for are more for the reader to
> easily know what the intended usage range of a variable is.

Ok. I'm not going to insist, but I would still prefer case-scope here.

Reviewed-by: Paul Durrant <paul@xen.org>

> 
> Jan
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 5/9] x86/HVM: refuse CR3 loads with reserved (upper) bits set
Posted by Jan Beulich 4 years, 7 months ago
While bits 11 and below are, if not used for other purposes, reserved
but ignored, bits beyond physical address width are supposed to raise
exceptions (at least in the non-nested case; I'm not convinced the
current nested SVM/VMX behavior of raising #GP(0) here is correct, but
that's not the subject of this change).

Introduce currd as a local variable, and replace other v->domain
instances at the same time.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
---
v2: Simplify the expressions used for the reserved bit checks.

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1004,6 +1004,13 @@ static int hvm_load_cpu_ctxt(struct doma
         return -EINVAL;
     }
 
+    if ( ctxt.cr3 >> d->arch.cpuid->extd.maxphysaddr )
+    {
+        printk(XENLOG_G_ERR "HVM%d restore: bad CR3 %#" PRIx64 "\n",
+               d->domain_id, ctxt.cr3);
+        return X86EMUL_EXCEPTION;
+    }
+
     if ( (ctxt.flags & ~XEN_X86_FPU_INITIALISED) != 0 )
     {
         gprintk(XENLOG_ERR, "bad flags value in CPU context: %#x\n",
@@ -2290,10 +2297,18 @@ int hvm_set_cr0(unsigned long value, boo
 int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
 {
     struct vcpu *v = current;
+    struct domain *currd = v->domain;
     struct page_info *page;
     unsigned long old = v->arch.hvm.guest_cr[3];
 
-    if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
+    if ( value >> currd->arch.cpuid->extd.maxphysaddr )
+    {
+        HVM_DBG_LOG(DBG_LEVEL_1,
+                    "Attempt to set reserved CR3 bit(s): %lx", value);
+        return X86EMUL_EXCEPTION;
+    }
+
+    if ( may_defer && unlikely(currd->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
     {
         ASSERT(v->arch.vm_event);
@@ -2309,13 +2324,12 @@ int hvm_set_cr3(unsigned long value, boo
         }
     }
 
-    if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
+    if ( hvm_paging_enabled(v) && !paging_mode_hap(currd) &&
          (value != v->arch.hvm.guest_cr[3]) )
     {
         /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
-        page = get_page_from_gfn(v->domain, value >> PAGE_SHIFT,
-                                 NULL, P2M_ALLOC);
+        page = get_page_from_gfn(currd, value >> PAGE_SHIFT, NULL, P2M_ALLOC);
         if ( !page )
             goto bad_cr3;
 
@@ -2331,7 +2345,7 @@ int hvm_set_cr3(unsigned long value, boo
 
  bad_cr3:
     gdprintk(XENLOG_ERR, "Invalid CR3\n");
-    domain_crash(v->domain);
+    domain_crash(currd);
     return X86EMUL_UNHANDLEABLE;
 }
 


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 5/9] x86/HVM: refuse CR3 loads with reserved (upper) bits set
Posted by Andrew Cooper 4 years, 7 months ago
On 17/09/2019 07:15, Jan Beulich wrote:
> While bits 11 and below are, if not used for other purposes, reserved
> but ignored, bits beyond physical address width are supposed to raise
> exceptions (at least in the non-nested case; I'm not convinced the
> current nested SVM/VMX behavior of raising #GP(0) here is correct, but
> that's not the subject of this change).
>
> Introduce currd as a local variable, and replace other v->domain
> instances at the same time.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
> ---
> v2: Simplify the expressions used for the reserved bit checks.
>
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -1004,6 +1004,13 @@ static int hvm_load_cpu_ctxt(struct doma
>          return -EINVAL;
>      }
>  
> +    if ( ctxt.cr3 >> d->arch.cpuid->extd.maxphysaddr )
> +    {
> +        printk(XENLOG_G_ERR "HVM%d restore: bad CR3 %#" PRIx64 "\n",
> +               d->domain_id, ctxt.cr3);
> +        return X86EMUL_EXCEPTION;

-EINVAL, surely?

Everything else LGTM (although I guess it depends on the previous
patch?) so with this fixed, Reviewed-by: Andrew Cooper
<andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 5/9] x86/HVM: refuse CR3 loads with reserved (upper) bits set
Posted by Jan Beulich 4 years, 7 months ago
On 17.09.2019 21:35, Andrew Cooper wrote:
> On 17/09/2019 07:15, Jan Beulich wrote:
>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -1004,6 +1004,13 @@ static int hvm_load_cpu_ctxt(struct doma
>>          return -EINVAL;
>>      }
>>  
>> +    if ( ctxt.cr3 >> d->arch.cpuid->extd.maxphysaddr )
>> +    {
>> +        printk(XENLOG_G_ERR "HVM%d restore: bad CR3 %#" PRIx64 "\n",
>> +               d->domain_id, ctxt.cr3);
>> +        return X86EMUL_EXCEPTION;
> 
> -EINVAL, surely?

Oh, indeed. Thanks for spotting.

> Everything else LGTM (although I guess it depends on the previous
> patch?)

It does, yes.

> so with this fixed, Reviewed-by: Andrew Cooper
> <andrew.cooper3@citrix.com>

Thanks.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 6/9] x86/HVM: relax shadow mode check in hvm_set_cr3()
Posted by Jan Beulich 4 years, 7 months ago
There's no need to re-obtain a page reference if only bits not affecting
the address change.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2325,7 +2325,7 @@ int hvm_set_cr3(unsigned long value, boo
     }
 
     if ( hvm_paging_enabled(v) && !paging_mode_hap(currd) &&
-         (value != v->arch.hvm.guest_cr[3]) )
+         ((value ^ v->arch.hvm.guest_cr[3]) >> PAGE_SHIFT) )
     {
         /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 6/9] x86/HVM: relax shadow mode check in hvm_set_cr3()
Posted by Andrew Cooper 4 years, 7 months ago
On 17/09/2019 07:16, Jan Beulich wrote:
> There's no need to re-obtain a page reference if only bits not affecting
> the address change.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 7/9] x86/HVM: cosmetics to hvm_set_cr3()
Posted by Jan Beulich 4 years, 7 months ago
Eliminate the not really useful local variable "old". Reduce the scope
of "page". Rename the latched "current".

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
---
v2: Re-base over change earlier in the series.

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2296,10 +2296,8 @@ int hvm_set_cr0(unsigned long value, boo
 
 int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    struct page_info *page;
-    unsigned long old = v->arch.hvm.guest_cr[3];
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
 
     if ( value >> currd->arch.cpuid->extd.maxphysaddr )
     {
@@ -2311,36 +2309,38 @@ int hvm_set_cr3(unsigned long value, boo
     if ( may_defer && unlikely(currd->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
     {
-        ASSERT(v->arch.vm_event);
+        ASSERT(curr->arch.vm_event);
 
-        if ( hvm_monitor_crX(CR3, value, old) )
+        if ( hvm_monitor_crX(CR3, value, curr->arch.hvm.guest_cr[3]) )
         {
             /* The actual write will occur in hvm_do_resume(), if permitted. */
-            v->arch.vm_event->write_data.do_write.cr3 = 1;
-            v->arch.vm_event->write_data.cr3 = value;
-            v->arch.vm_event->write_data.cr3_noflush = noflush;
+            curr->arch.vm_event->write_data.do_write.cr3 = 1;
+            curr->arch.vm_event->write_data.cr3 = value;
+            curr->arch.vm_event->write_data.cr3_noflush = noflush;
 
             return X86EMUL_OKAY;
         }
     }
 
-    if ( hvm_paging_enabled(v) && !paging_mode_hap(currd) &&
-         ((value ^ v->arch.hvm.guest_cr[3]) >> PAGE_SHIFT) )
+    if ( hvm_paging_enabled(curr) && !paging_mode_hap(currd) &&
+         ((value ^ curr->arch.hvm.guest_cr[3]) >> PAGE_SHIFT) )
     {
         /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
+        struct page_info *page;
+
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
         page = get_page_from_gfn(currd, value >> PAGE_SHIFT, NULL, P2M_ALLOC);
         if ( !page )
             goto bad_cr3;
 
-        put_page(pagetable_get_page(v->arch.guest_table));
-        v->arch.guest_table = pagetable_from_page(page);
+        put_page(pagetable_get_page(curr->arch.guest_table));
+        curr->arch.guest_table = pagetable_from_page(page);
 
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
     }
 
-    v->arch.hvm.guest_cr[3] = value;
-    paging_update_cr3(v, noflush);
+    curr->arch.hvm.guest_cr[3] = value;
+    paging_update_cr3(curr, noflush);
     return X86EMUL_OKAY;
 
  bad_cr3:


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 7/9] x86/HVM: cosmetics to hvm_set_cr3()
Posted by Andrew Cooper 4 years, 7 months ago
On 17/09/2019 07:16, Jan Beulich wrote:
> Eliminate the not really useful local variable "old". Reduce the scope
> of "page". Rename the latched "current".
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 8/9] x86/CPUID: drop INVPCID dependency on PCID
Posted by Jan Beulich 4 years, 7 months ago
PCID validly depends on LM, as it can be enabled in Long Mode only.
INVPCID, otoh, can be used not only without PCID enabled, but also
outside of Long Mode altogether. In both cases its functionality is
simply restricted to PCID 0, which is sort of expected as no other PCID
can be activated there.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>

--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -218,10 +218,6 @@ def crunch_numbers(state):
         #
         # SSE4_2: [POPCNT]
 
-        # The INVPCID instruction depends on PCID infrastructure being
-        # available.
-        PCID: [INVPCID],
-
         # XSAVE is an extra set of instructions for state management, but
         # doesn't constitue new state itself.  Some of the dependent features
         # are instructions built on top of base XSAVE, while others are new


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 8/9] x86/CPUID: drop INVPCID dependency on PCID
Posted by Andrew Cooper 4 years, 7 months ago
On 17/09/2019 07:17, Jan Beulich wrote:
> PCID validly depends on LM, as it can be enabled in Long Mode only.
> INVPCID, otoh, can be used not only without PCID enabled, but also
> outside of Long Mode altogether. In both cases its functionality is
> simply restricted to PCID 0, which is sort of expected as no other PCID
> can be activated there.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
>
> --- a/xen/tools/gen-cpuid.py
> +++ b/xen/tools/gen-cpuid.py
> @@ -218,10 +218,6 @@ def crunch_numbers(state):
>          #
>          # SSE4_2: [POPCNT]
>  
> -        # The INVPCID instruction depends on PCID infrastructure being
> -        # available.
> -        PCID: [INVPCID],

I take it you haven't tried running a shadow guest with this change in
place.

For !EPT, we clear ENABLE_INVPCID so the instruction will #UD within the
guest.

We could in principle allow the use of INVPCID with shadow guests, but
that would involve teaching Xen how to cope with VMEXIT_REASON_INVPCID
(which is predicated on INVLPG exiting) and wire the flush back into the
shadow logic.

Perhaps the comment could be a little clearer, but it is no accident
that dependency is in place.

~Andrew

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 8/9] x86/CPUID: drop INVPCID dependency on PCID
Posted by Jan Beulich 4 years, 7 months ago
On 17.09.2019 21:59, Andrew Cooper wrote:
> On 17/09/2019 07:17, Jan Beulich wrote:
>> PCID validly depends on LM, as it can be enabled in Long Mode only.
>> INVPCID, otoh, can be used not only without PCID enabled, but also
>> outside of Long Mode altogether. In both cases its functionality is
>> simply restricted to PCID 0, which is sort of expected as no other PCID
>> can be activated there.
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
>>
>> --- a/xen/tools/gen-cpuid.py
>> +++ b/xen/tools/gen-cpuid.py
>> @@ -218,10 +218,6 @@ def crunch_numbers(state):
>>          #
>>          # SSE4_2: [POPCNT]
>>  
>> -        # The INVPCID instruction depends on PCID infrastructure being
>> -        # available.
>> -        PCID: [INVPCID],
> 
> I take it you haven't tried running a shadow guest with this change in
> place.
> 
> For !EPT, we clear ENABLE_INVPCID so the instruction will #UD within the
> guest.
> 
> We could in principle allow the use of INVPCID with shadow guests, but
> that would involve teaching Xen how to cope with VMEXIT_REASON_INVPCID
> (which is predicated on INVLPG exiting) and wire the flush back into the
> shadow logic.
> 
> Perhaps the comment could be a little clearer, but it is no accident
> that dependency is in place.

I'm afraid I don't follow: Both features are marked H in the
public header, so I don't see how shadow mode considerations
apply here at all.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 8/9] x86/CPUID: drop INVPCID dependency on PCID
Posted by Andrew Cooper 4 years, 7 months ago
On 18/09/2019 10:10, Jan Beulich wrote:
> On 17.09.2019 21:59, Andrew Cooper wrote:
>> On 17/09/2019 07:17, Jan Beulich wrote:
>>> PCID validly depends on LM, as it can be enabled in Long Mode only.
>>> INVPCID, otoh, can be used not only without PCID enabled, but also
>>> outside of Long Mode altogether. In both cases its functionality is
>>> simply restricted to PCID 0, which is sort of expected as no other PCID
>>> can be activated there.
>>>
>>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>>> Reviewed-by: Roger Pau Monné <roger.pau@citrix.com>
>>>
>>> --- a/xen/tools/gen-cpuid.py
>>> +++ b/xen/tools/gen-cpuid.py
>>> @@ -218,10 +218,6 @@ def crunch_numbers(state):
>>>          #
>>>          # SSE4_2: [POPCNT]
>>>  
>>> -        # The INVPCID instruction depends on PCID infrastructure being
>>> -        # available.
>>> -        PCID: [INVPCID],
>> I take it you haven't tried running a shadow guest with this change in
>> place.
>>
>> For !EPT, we clear ENABLE_INVPCID so the instruction will #UD within the
>> guest.
>>
>> We could in principle allow the use of INVPCID with shadow guests, but
>> that would involve teaching Xen how to cope with VMEXIT_REASON_INVPCID
>> (which is predicated on INVLPG exiting) and wire the flush back into the
>> shadow logic.
>>
>> Perhaps the comment could be a little clearer, but it is no accident
>> that dependency is in place.
> I'm afraid I don't follow: Both features are marked H in the
> public header, so I don't see how shadow mode considerations
> apply here at all.

Oh.  Of course.  I'm not confused as to why that was present to being with.

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v2 9/9] x86: PCID is unused when !PV
Posted by Jan Beulich 4 years, 7 months ago
This allows in particular some streamlining of the TLB flushing code
paths.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v2: Avoid #ifdef in cr3_pcid().

--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -24,6 +24,11 @@
 #define WRAP_MASK (0x000003FFU)
 #endif
 
+#ifndef CONFIG_PV
+# undef X86_CR4_PCIDE
+# define X86_CR4_PCIDE 0
+#endif
+
 u32 tlbflush_clock = 1U;
 DEFINE_PER_CPU(u32, tlbflush_time);
 
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -289,7 +289,7 @@ static inline unsigned long cr3_pa(unsig
 
 static inline unsigned int cr3_pcid(unsigned long cr3)
 {
-    return cr3 & X86_CR3_PCID_MASK;
+    return IS_ENABLED(CONFIG_PV) ? cr3 & X86_CR3_PCID_MASK : 0;
 }
 
 static inline unsigned long read_cr4(void)
@@ -301,8 +301,12 @@ static inline void write_cr4(unsigned lo
 {
     struct cpu_info *info = get_cpu_info();
 
+#ifdef CONFIG_PV
     /* No global pages in case of PCIDs enabled! */
     ASSERT(!(val & X86_CR4_PGE) || !(val & X86_CR4_PCIDE));
+#else
+    ASSERT(!(val & X86_CR4_PCIDE));
+#endif
 
     /*
      * On hardware supporting FSGSBASE, the value in %cr4 is the kernel's
--- a/xen/include/asm-x86/pv/domain.h
+++ b/xen/include/asm-x86/pv/domain.h
@@ -50,8 +50,13 @@
  */
 static inline unsigned long get_pcid_bits(const struct vcpu *v, bool is_xpti)
 {
+#ifdef CONFIG_PV
     return X86_CR3_NOFLUSH | (is_xpti ? PCID_PV_XPTI : 0) |
            ((v->arch.flags & TF_kernel_mode) ? PCID_PV_PRIV : PCID_PV_USER);
+#else
+    ASSERT_UNREACHABLE();
+    return 0;
+#endif
 }
 
 #ifdef CONFIG_PV


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v2 9/9] x86: PCID is unused when !PV
Posted by Andrew Cooper 4 years, 7 months ago
On 17/09/2019 07:17, Jan Beulich wrote:
> This allows in particular some streamlining of the TLB flushing code
> paths.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel