[Xen-devel] [PATCH 0/9] XSA-292 follow-up

Jan Beulich posted 9 patches 62 weeks ago
Only 0 patches received!

[Xen-devel] [PATCH 0/9] XSA-292 follow-up

Posted by Jan Beulich 62 weeks ago
Various CR3 and PCID related adjustments, first and foremost
an almost full re-write of switch_cr3_cr4() (in patch 2).

1: x86: adjust cr3_pcid() return type
2: x86: limit the amount of TLB flushing in switch_cr3_cr4()
3: x86/mm: honor opt_pcid also for 32-bit PV domains
4: x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
5: x86/HVM: refuse CR3 loads with reserved (upper) bits set
6: x86/HVM: relax shadow mode check in hvm_set_cr3()
7: x86/HVM: cosmetics to hvm_set_cr3()
8: x86/CPUID: drop INVPCID dependency on PCID
9: x86: PCID is unused when !PV

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH 1/9] x86: adjust cr3_pcid() return type

Posted by Jan Beulich 62 weeks ago
There's no need for it to be 64 bits wide - only the low twelve bits
of CR3 hold the PCID.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -103,7 +103,8 @@ static void do_tlb_flush(void)
 
 void switch_cr3_cr4(unsigned long cr3, unsigned long cr4)
 {
-    unsigned long flags, old_cr4, old_pcid;
+    unsigned long flags, old_cr4;
+    unsigned int old_pcid;
     u32 t;
 
     /* This non-reentrant function is sometimes called in interrupt context. */
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -292,7 +292,7 @@ static inline unsigned long cr3_pa(unsig
     return cr3 & X86_CR3_ADDR_MASK;
 }
 
-static inline unsigned long cr3_pcid(unsigned long cr3)
+static inline unsigned int cr3_pcid(unsigned long cr3)
 {
     return cr3 & X86_CR3_PCID_MASK;
 }



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH 2/9] x86: limit the amount of TLB flushing in switch_cr3_cr4()

Posted by Jan Beulich 62 weeks ago
We really need to flush the TLB just once, if we do so with or after the
CR3 write. The only case where two flushes are unavoidable is when we
mean to turn off CR4.PGE (perhaps just temporarily; see the code
comment).

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -104,82 +104,65 @@ static void do_tlb_flush(void)
 void switch_cr3_cr4(unsigned long cr3, unsigned long cr4)
 {
     unsigned long flags, old_cr4;
-    unsigned int old_pcid;
     u32 t;
 
+    /* Throughout this function we make this assumption: */
+    ASSERT(!(cr4 & X86_CR4_PCIDE) || !(cr4 & X86_CR4_PGE));
+
     /* This non-reentrant function is sometimes called in interrupt context. */
     local_irq_save(flags);
 
     t = pre_flush();
 
     old_cr4 = read_cr4();
-    if ( old_cr4 & X86_CR4_PGE )
+    ASSERT(!(old_cr4 & X86_CR4_PCIDE) || !(old_cr4 & X86_CR4_PGE));
+
+    /*
+     * We need to write CR4 before CR3 if we're about to enable PCIDE, at the
+     * very least when the new PCID is non-zero.
+     *
+     * As we also need to do two CR4 writes in total when PGE is enabled and
+     * is to remain enabled, do the one temporarily turning off the bit right
+     * here as well.
+     *
+     * The only TLB flushing effect we depend on here is in case we move from
+     * PGE set to PCIDE set, where we want global page entries gone (and none
+     * to re-appear) after this write.
+     */
+    if ( !(old_cr4 & X86_CR4_PCIDE) &&
+         ((cr4 & X86_CR4_PCIDE) || (cr4 & old_cr4 & X86_CR4_PGE)) )
     {
-        /*
-         * X86_CR4_PGE set means PCID is inactive.
-         * We have to purge the TLB via flipping cr4.pge.
-         */
         old_cr4 = cr4 & ~X86_CR4_PGE;
         write_cr4(old_cr4);
     }
-    else if ( use_invpcid )
-    {
-        /*
-         * Flushing the TLB via INVPCID is necessary only in case PCIDs are
-         * in use, which is true only with INVPCID being available.
-         * Without PCID usage the following write_cr3() will purge the TLB
-         * (we are in the cr4.pge off path) of all entries.
-         * Using invpcid_flush_all_nonglobals() seems to be faster than
-         * invpcid_flush_all(), so use that.
-         */
-        invpcid_flush_all_nonglobals();
-
-        /*
-         * CR4.PCIDE needs to be set before the CR3 write below. Otherwise
-         * - the CR3 write will fault when CR3.NOFLUSH is set (which is the
-         *   case normally),
-         * - the subsequent CR4 write will fault if CR3.PCID != 0.
-         */
-        if ( (old_cr4 & X86_CR4_PCIDE) < (cr4 & X86_CR4_PCIDE) )
-        {
-            write_cr4(cr4);
-            old_cr4 = cr4;
-        }
-    }
 
     /*
-     * If we don't change PCIDs, the CR3 write below needs to flush this very
-     * PCID, even when a full flush was performed above, as we are currently
-     * accumulating TLB entries again from the old address space.
-     * NB: Clearing the bit when we don't use PCID is benign (as it is clear
-     * already in that case), but allows the if() to be more simple.
+     * If the CR4 write is to turn off PCIDE, we don't need the CR3 write to
+     * flush anything, as that transition is a full flush itself.
      */
-    old_pcid = cr3_pcid(read_cr3());
-    if ( old_pcid == cr3_pcid(cr3) )
-        cr3 &= ~X86_CR3_NOFLUSH;
-
+    if ( (old_cr4 & X86_CR4_PCIDE) > (cr4 & X86_CR4_PCIDE) )
+        cr3 |= X86_CR3_NOFLUSH;
     write_cr3(cr3);
 
     if ( old_cr4 != cr4 )
         write_cr4(cr4);
 
     /*
-     * Make sure no TLB entries related to the old PCID created between
-     * flushing the TLB and writing the new %cr3 value remain in the TLB.
-     *
-     * The write to CR4 just above has performed a wider flush in certain
-     * cases, which therefore get excluded here. Since that write is
-     * conditional, note in particular that it won't be skipped if PCIDE
-     * transitions from 1 to 0. This is because the CR4 write further up will
-     * have been skipped in this case, as PCIDE and PGE won't both be set at
-     * the same time.
-     *
-     * Note also that PGE is always clear in old_cr4.
+     *  PGE  | PCIDE | flush at
+     * ------+-------+------------------------
+     *  0->0 | 0->0  | CR3 write
+     *  0->0 | 0->1  | n/a (see 1st CR4 write)
+     *  0->x | 1->0  | CR4 write
+     *  x->1 | x->1  | n/a
+     *  0->0 | 1->1  | INVPCID
+     *  0->1 | 0->0  | CR3 and CR4 writes
+     *  1->0 | 0->0  | CR4 write
+     *  1->0 | 0->1  | n/a (see 1st CR4 write)
+     *  1->1 | 0->0  | n/a (see 1st CR4 write)
+     *  1->x | 1->x  | n/a
      */
-    if ( old_pcid != cr3_pcid(cr3) &&
-         !(cr4 & X86_CR4_PGE) &&
-         (old_cr4 & X86_CR4_PCIDE) <= (cr4 & X86_CR4_PCIDE) )
-        invpcid_flush_single_context(old_pcid);
+    if ( cr4 & X86_CR4_PCIDE )
+        invpcid_flush_all_nonglobals();
 
     post_flush(t);
 




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH 3/9] x86/mm: honor opt_pcid also for 32-bit PV domains

Posted by Jan Beulich 62 weeks ago
I can't see any technical or performance reason why we should treat
32-bit PV different from 64-bit PV in this regard.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/pv/domain.c
+++ b/xen/arch/x86/pv/domain.c
@@ -180,7 +180,24 @@ int switch_compat(struct domain *d)
     d->arch.x87_fip_width = 4;
 
     d->arch.pv.xpti = false;
-    d->arch.pv.pcid = false;
+
+    if ( use_invpcid && cpu_has_pcid )
+        switch ( ACCESS_ONCE(opt_pcid) )
+        {
+        case PCID_OFF:
+        case PCID_XPTI:
+            d->arch.pv.pcid = false;
+            break;
+
+        case PCID_ALL:
+        case PCID_NOXPTI:
+            d->arch.pv.pcid = true;
+            break;
+
+        default:
+            ASSERT_UNREACHABLE();
+            break;
+        }
 
     return 0;
 
@@ -312,7 +329,7 @@ int pv_domain_initialise(struct domain *
 
     d->arch.pv.xpti = is_hardware_domain(d) ? opt_xpti_hwdom : opt_xpti_domu;
 
-    if ( !is_pv_32bit_domain(d) && use_invpcid && cpu_has_pcid )
+    if ( use_invpcid && cpu_has_pcid )
         switch ( ACCESS_ONCE(opt_pcid) )
         {
         case PCID_OFF:





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()

Posted by Jan Beulich 62 weeks ago
The bit is meaningful only for MOV-to-CR3 insns, not anywhere else, in
particular not when loading nested guest state.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -2072,6 +2072,8 @@ static int hvmemul_write_cr(
     HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
     switch ( reg )
     {
+        bool noflush;
+
     case 0:
         rc = hvm_set_cr0(val, true);
         break;
@@ -2082,7 +2084,10 @@ static int hvmemul_write_cr(
         break;
 
     case 3:
-        rc = hvm_set_cr3(val, true);
+        noflush = hvm_pcid_enabled(current) && (val & X86_CR3_NOFLUSH);
+        if ( noflush )
+            val &= ~X86_CR3_NOFLUSH;
+        rc = hvm_set_cr3(val, noflush, true);
         break;
 
     case 4:
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2053,12 +2053,17 @@ int hvm_mov_to_cr(unsigned int cr, unsig
 
     switch ( cr )
     {
+        bool noflush;
+
     case 0:
         rc = hvm_set_cr0(val, true);
         break;
 
     case 3:
-        rc = hvm_set_cr3(val, true);
+        noflush = hvm_pcid_enabled(curr) && (val & X86_CR3_NOFLUSH);
+        if ( noflush )
+            val &= ~X86_CR3_NOFLUSH;
+        rc = hvm_set_cr3(val, noflush, true);
         break;
 
     case 4:
@@ -2276,12 +2281,11 @@ int hvm_set_cr0(unsigned long value, boo
     return X86EMUL_OKAY;
 }
 
-int hvm_set_cr3(unsigned long value, bool may_defer)
+int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
 {
     struct vcpu *v = current;
     struct page_info *page;
     unsigned long old = v->arch.hvm.guest_cr[3];
-    bool noflush = false;
 
     if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
@@ -2293,17 +2297,12 @@ int hvm_set_cr3(unsigned long value, boo
             /* The actual write will occur in hvm_do_resume(), if permitted. */
             v->arch.vm_event->write_data.do_write.cr3 = 1;
             v->arch.vm_event->write_data.cr3 = value;
+            v->arch.vm_event->write_data.cr3_noflush = noflush;
 
             return X86EMUL_OKAY;
         }
     }
 
-    if ( hvm_pcid_enabled(v) ) /* Clear the noflush bit. */
-    {
-        noflush = value & X86_CR3_NOFLUSH;
-        value &= ~X86_CR3_NOFLUSH;
-    }
-
     if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
          (value != v->arch.hvm.guest_cr[3]) )
     {
@@ -2998,7 +2997,7 @@ void hvm_task_switch(
     if ( task_switch_load_seg(x86_seg_ldtr, tss.ldt, new_cpl, 0) )
         goto out;
 
-    rc = hvm_set_cr3(tss.cr3, true);
+    rc = hvm_set_cr3(tss.cr3, false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
     if ( rc != X86EMUL_OKAY )
--- a/xen/arch/x86/hvm/svm/nestedsvm.c
+++ b/xen/arch/x86/hvm/svm/nestedsvm.c
@@ -324,7 +324,7 @@ static int nsvm_vcpu_hostrestore(struct
         v->arch.guest_table = pagetable_null();
         /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
     }
-    rc = hvm_set_cr3(n1vmcb->_cr3, true);
+    rc = hvm_set_cr3(n1vmcb->_cr3, false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
     if (rc != X86EMUL_OKAY)
@@ -584,7 +584,7 @@ static int nsvm_vmcb_prepare4vmrun(struc
         nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
 
         /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
-        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
+        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
         if ( rc == X86EMUL_EXCEPTION )
             hvm_inject_hw_exception(TRAP_gp_fault, 0);
         if (rc != X86EMUL_OKAY)
@@ -598,7 +598,7 @@ static int nsvm_vmcb_prepare4vmrun(struc
          * we assume it intercepts page faults.
          */
         /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
-        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
+        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
         if ( rc == X86EMUL_EXCEPTION )
             hvm_inject_hw_exception(TRAP_gp_fault, 0);
         if (rc != X86EMUL_OKAY)
--- a/xen/arch/x86/hvm/vm_event.c
+++ b/xen/arch/x86/hvm/vm_event.c
@@ -110,7 +110,7 @@ void hvm_vm_event_do_resume(struct vcpu
 
     if ( unlikely(w->do_write.cr3) )
     {
-        if ( hvm_set_cr3(w->cr3, false) == X86EMUL_EXCEPTION )
+        if ( hvm_set_cr3(w->cr3, w->cr3_noflush, false) == X86EMUL_EXCEPTION )
             hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
         w->do_write.cr3 = 0;
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -1028,7 +1028,7 @@ static void load_shadow_guest_state(stru
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
-    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), true);
+    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
@@ -1242,7 +1242,7 @@ static void load_vvmcs_host_state(struct
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
-    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), true);
+    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), false, true);
     if ( rc == X86EMUL_EXCEPTION )
         hvm_inject_hw_exception(TRAP_gp_fault, 0);
 
--- a/xen/include/asm-x86/domain.h
+++ b/xen/include/asm-x86/domain.h
@@ -275,6 +275,8 @@ struct monitor_write_data {
         unsigned int cr4 : 1;
     } do_write;
 
+    bool cr3_noflush;
+
     uint32_t msr;
     uint64_t value;
     uint64_t cr0;
--- a/xen/include/asm-x86/hvm/support.h
+++ b/xen/include/asm-x86/hvm/support.h
@@ -135,7 +135,7 @@ void hvm_shadow_handle_cd(struct vcpu *v
  */
 int hvm_set_efer(uint64_t value);
 int hvm_set_cr0(unsigned long value, bool may_defer);
-int hvm_set_cr3(unsigned long value, bool may_defer);
+int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer);
 int hvm_set_cr4(unsigned long value, bool may_defer);
 int hvm_descriptor_access_intercept(uint64_t exit_info,
                                     uint64_t vmx_exit_qualification,




_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()

Posted by Paul Durrant 62 weeks ago
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 02 May 2019 13:20
> To: xen-devel <xen-devel@lists.xenproject.org>
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; Paul Durrant <Paul.Durrant@citrix.com>; Roger Pau Monne
> <roger.pau@citrix.com>; Wei Liu <wei.liu2@citrix.com>; George Dunlap <George.Dunlap@citrix.com>
> Subject: [PATCH 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
> 
> The bit is meaningful only for MOV-to-CR3 insns, not anywhere else, in
> particular not when loading nested guest state.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -2072,6 +2072,8 @@ static int hvmemul_write_cr(
>      HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
>      switch ( reg )
>      {
> +        bool noflush;
> +

Why introduce 'noflush' with this scope when it could be limited to 'case 3:', although...

>      case 0:
>          rc = hvm_set_cr0(val, true);
>          break;
> @@ -2082,7 +2084,10 @@ static int hvmemul_write_cr(
>          break;
> 
>      case 3:
> -        rc = hvm_set_cr3(val, true);
> +        noflush = hvm_pcid_enabled(current) && (val & X86_CR3_NOFLUSH);
> +        if ( noflush )
> +            val &= ~X86_CR3_NOFLUSH;

... can't you just code this as:

if ( hvm_pcid_enabled(current) )
    val &= ~X86_CR3_NOFLUSH;

?

  Paul

> +        rc = hvm_set_cr3(val, noflush, true);
>          break;
> 
>      case 4:
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -2053,12 +2053,17 @@ int hvm_mov_to_cr(unsigned int cr, unsig
> 
>      switch ( cr )
>      {
> +        bool noflush;
> +
>      case 0:
>          rc = hvm_set_cr0(val, true);
>          break;
> 
>      case 3:
> -        rc = hvm_set_cr3(val, true);
> +        noflush = hvm_pcid_enabled(curr) && (val & X86_CR3_NOFLUSH);
> +        if ( noflush )
> +            val &= ~X86_CR3_NOFLUSH;
> +        rc = hvm_set_cr3(val, noflush, true);
>          break;
> 
>      case 4:
> @@ -2276,12 +2281,11 @@ int hvm_set_cr0(unsigned long value, boo
>      return X86EMUL_OKAY;
>  }
> 
> -int hvm_set_cr3(unsigned long value, bool may_defer)
> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
>  {
>      struct vcpu *v = current;
>      struct page_info *page;
>      unsigned long old = v->arch.hvm.guest_cr[3];
> -    bool noflush = false;
> 
>      if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
>                                 monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
> @@ -2293,17 +2297,12 @@ int hvm_set_cr3(unsigned long value, boo
>              /* The actual write will occur in hvm_do_resume(), if permitted. */
>              v->arch.vm_event->write_data.do_write.cr3 = 1;
>              v->arch.vm_event->write_data.cr3 = value;
> +            v->arch.vm_event->write_data.cr3_noflush = noflush;
> 
>              return X86EMUL_OKAY;
>          }
>      }
> 
> -    if ( hvm_pcid_enabled(v) ) /* Clear the noflush bit. */
> -    {
> -        noflush = value & X86_CR3_NOFLUSH;
> -        value &= ~X86_CR3_NOFLUSH;
> -    }
> -
>      if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
>           (value != v->arch.hvm.guest_cr[3]) )
>      {
> @@ -2998,7 +2997,7 @@ void hvm_task_switch(
>      if ( task_switch_load_seg(x86_seg_ldtr, tss.ldt, new_cpl, 0) )
>          goto out;
> 
> -    rc = hvm_set_cr3(tss.cr3, true);
> +    rc = hvm_set_cr3(tss.cr3, false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
>      if ( rc != X86EMUL_OKAY )
> --- a/xen/arch/x86/hvm/svm/nestedsvm.c
> +++ b/xen/arch/x86/hvm/svm/nestedsvm.c
> @@ -324,7 +324,7 @@ static int nsvm_vcpu_hostrestore(struct
>          v->arch.guest_table = pagetable_null();
>          /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
>      }
> -    rc = hvm_set_cr3(n1vmcb->_cr3, true);
> +    rc = hvm_set_cr3(n1vmcb->_cr3, false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
>      if (rc != X86EMUL_OKAY)
> @@ -584,7 +584,7 @@ static int nsvm_vmcb_prepare4vmrun(struc
>          nestedsvm_vmcb_set_nestedp2m(v, ns_vmcb, n2vmcb);
> 
>          /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
> -        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
> +        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
>          if ( rc == X86EMUL_EXCEPTION )
>              hvm_inject_hw_exception(TRAP_gp_fault, 0);
>          if (rc != X86EMUL_OKAY)
> @@ -598,7 +598,7 @@ static int nsvm_vmcb_prepare4vmrun(struc
>           * we assume it intercepts page faults.
>           */
>          /* hvm_set_cr3() below sets v->arch.hvm.guest_cr[3] for us. */
> -        rc = hvm_set_cr3(ns_vmcb->_cr3, true);
> +        rc = hvm_set_cr3(ns_vmcb->_cr3, false, true);
>          if ( rc == X86EMUL_EXCEPTION )
>              hvm_inject_hw_exception(TRAP_gp_fault, 0);
>          if (rc != X86EMUL_OKAY)
> --- a/xen/arch/x86/hvm/vm_event.c
> +++ b/xen/arch/x86/hvm/vm_event.c
> @@ -110,7 +110,7 @@ void hvm_vm_event_do_resume(struct vcpu
> 
>      if ( unlikely(w->do_write.cr3) )
>      {
> -        if ( hvm_set_cr3(w->cr3, false) == X86EMUL_EXCEPTION )
> +        if ( hvm_set_cr3(w->cr3, w->cr3_noflush, false) == X86EMUL_EXCEPTION )
>              hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
>          w->do_write.cr3 = 0;
> --- a/xen/arch/x86/hvm/vmx/vvmx.c
> +++ b/xen/arch/x86/hvm/vmx/vvmx.c
> @@ -1028,7 +1028,7 @@ static void load_shadow_guest_state(stru
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> -    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), true);
> +    rc = hvm_set_cr3(get_vvmcs(v, GUEST_CR3), false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> @@ -1242,7 +1242,7 @@ static void load_vvmcs_host_state(struct
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> -    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), true);
> +    rc = hvm_set_cr3(get_vvmcs(v, HOST_CR3), false, true);
>      if ( rc == X86EMUL_EXCEPTION )
>          hvm_inject_hw_exception(TRAP_gp_fault, 0);
> 
> --- a/xen/include/asm-x86/domain.h
> +++ b/xen/include/asm-x86/domain.h
> @@ -275,6 +275,8 @@ struct monitor_write_data {
>          unsigned int cr4 : 1;
>      } do_write;
> 
> +    bool cr3_noflush;
> +
>      uint32_t msr;
>      uint64_t value;
>      uint64_t cr0;
> --- a/xen/include/asm-x86/hvm/support.h
> +++ b/xen/include/asm-x86/hvm/support.h
> @@ -135,7 +135,7 @@ void hvm_shadow_handle_cd(struct vcpu *v
>   */
>  int hvm_set_efer(uint64_t value);
>  int hvm_set_cr0(unsigned long value, bool may_defer);
> -int hvm_set_cr3(unsigned long value, bool may_defer);
> +int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer);
>  int hvm_set_cr4(unsigned long value, bool may_defer);
>  int hvm_descriptor_access_intercept(uint64_t exit_info,
>                                      uint64_t vmx_exit_qualification,
> 
> 


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()

Posted by Jan Beulich 62 weeks ago
>>> On 02.05.19 at 15:07, <Paul.Durrant@citrix.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 02 May 2019 13:20
>> 
>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -2072,6 +2072,8 @@ static int hvmemul_write_cr(
>>      HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
>>      switch ( reg )
>>      {
>> +        bool noflush;
>> +
> 
> Why introduce 'noflush' with this scope when it could be limited to 'case 
> 3:', although...

Because this would entail introducing another set of braces, and
I pretty much dislike these case-block braces: They either don't
properly indent (as we do commonly), or they needlessly increase
indentation of the enclosed block. Hence my general preference
of switch-scope local variables.

>> @@ -2082,7 +2084,10 @@ static int hvmemul_write_cr(
>>          break;
>> 
>>      case 3:
>> -        rc = hvm_set_cr3(val, true);
>> +        noflush = hvm_pcid_enabled(current) && (val & X86_CR3_NOFLUSH);
>> +        if ( noflush )
>> +            val &= ~X86_CR3_NOFLUSH;
> 
> ... can't you just code this as:
> 
> if ( hvm_pcid_enabled(current) )
>     val &= ~X86_CR3_NOFLUSH;
> 
> ?

Because of ...

>> +        rc = hvm_set_cr3(val, noflush, true);

... this further use of "noflush" (alongside the adjusted "val").

Jan



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

Re: [Xen-devel] [PATCH 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()

Posted by Paul Durrant 62 weeks ago
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 02 May 2019 14:23
> To: Paul Durrant <Paul.Durrant@citrix.com>
> Cc: Andrew Cooper <Andrew.Cooper3@citrix.com>; George Dunlap <George.Dunlap@citrix.com>; Roger Pau
> Monne <roger.pau@citrix.com>; Wei Liu <wei.liu2@citrix.com>; xen-devel <xen-
> devel@lists.xenproject.org>
> Subject: RE: [PATCH 4/9] x86/HVM: move NOFLUSH handling out of hvm_set_cr3()
> 
> >>> On 02.05.19 at 15:07, <Paul.Durrant@citrix.com> wrote:
> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> Sent: 02 May 2019 13:20
> >>
> >> --- a/xen/arch/x86/hvm/emulate.c
> >> +++ b/xen/arch/x86/hvm/emulate.c
> >> @@ -2072,6 +2072,8 @@ static int hvmemul_write_cr(
> >>      HVMTRACE_LONG_2D(CR_WRITE, reg, TRC_PAR_LONG(val));
> >>      switch ( reg )
> >>      {
> >> +        bool noflush;
> >> +
> >
> > Why introduce 'noflush' with this scope when it could be limited to 'case
> > 3:', although...
> 
> Because this would entail introducing another set of braces, and
> I pretty much dislike these case-block braces: They either don't
> properly indent (as we do commonly), or they needlessly increase
> indentation of the enclosed block. Hence my general preference
> of switch-scope local variables.
> 
> >> @@ -2082,7 +2084,10 @@ static int hvmemul_write_cr(
> >>          break;
> >>
> >>      case 3:
> >> -        rc = hvm_set_cr3(val, true);
> >> +        noflush = hvm_pcid_enabled(current) && (val & X86_CR3_NOFLUSH);
> >> +        if ( noflush )
> >> +            val &= ~X86_CR3_NOFLUSH;
> >
> > ... can't you just code this as:
> >
> > if ( hvm_pcid_enabled(current) )
> >     val &= ~X86_CR3_NOFLUSH;
> >
> > ?
> 
> Because of ...
> 
> >> +        rc = hvm_set_cr3(val, noflush, true);
> 
> ... this further use of "noflush" (alongside the adjusted "val").
> 

Ah, missed that... I'd still go for the tighter scope though, but then I don't mind the extra braces.

  Paul

> Jan
> 


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH 5/9] x86/HVM: refuse CR3 loads with reserved (upper) bits set

Posted by Jan Beulich 62 weeks ago
While bits 11 and below are, it not used for other purposes, reserved
but ignored, bits beyond physical address width are supposed to raise
exceptions (at least in the non-nested case; I'm not convinced the
current nested SVM/VMX behavior of raising #GP(0) here is correct, but
that's not the subject of this change).

Introduce currd as a local variable, and replace other v->domain
instances at the same time.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1003,6 +1003,13 @@ static int hvm_load_cpu_ctxt(struct doma
         return -EINVAL;
     }
 
+    if ( ctxt.cr3 & ~((1UL << d->arch.cpuid->extd.maxphysaddr) - 1) )
+    {
+        printk(XENLOG_G_ERR "HVM%d restore: bad CR3 %#" PRIx64 "\n",
+               d->domain_id, ctxt.cr3);
+        return X86EMUL_EXCEPTION;
+    }
+
     if ( (ctxt.flags & ~XEN_X86_FPU_INITIALISED) != 0 )
     {
         gprintk(XENLOG_ERR, "bad flags value in CPU context: %#x\n",
@@ -2284,10 +2291,19 @@ int hvm_set_cr0(unsigned long value, boo
 int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
 {
     struct vcpu *v = current;
+    struct domain *currd = v->domain;
     struct page_info *page;
     unsigned long old = v->arch.hvm.guest_cr[3];
 
-    if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled &
+    if ( value & ~((1UL << currd->arch.cpuid->extd.maxphysaddr) - 1) )
+    {
+        HVM_DBG_LOG(DBG_LEVEL_1,
+                    "Attempt to set reserved CR3 bit(s): %lx",
+                    value);
+        return X86EMUL_EXCEPTION;
+    }
+
+    if ( may_defer && unlikely(currd->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
     {
         ASSERT(v->arch.vm_event);
@@ -2303,13 +2319,12 @@ int hvm_set_cr3(unsigned long value, boo
         }
     }
 
-    if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) &&
+    if ( hvm_paging_enabled(v) && !paging_mode_hap(currd) &&
          (value != v->arch.hvm.guest_cr[3]) )
     {
         /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
-        page = get_page_from_gfn(v->domain, value >> PAGE_SHIFT,
-                                 NULL, P2M_ALLOC);
+        page = get_page_from_gfn(currd, value >> PAGE_SHIFT, NULL, P2M_ALLOC);
         if ( !page )
             goto bad_cr3;
 
@@ -2325,7 +2340,7 @@ int hvm_set_cr3(unsigned long value, boo
 
  bad_cr3:
     gdprintk(XENLOG_ERR, "Invalid CR3\n");
-    domain_crash(v->domain);
+    domain_crash(currd);
     return X86EMUL_UNHANDLEABLE;
 }
 





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH 6/9] x86/HVM: relax shadow mode check in hvm_set_cr3()

Posted by Jan Beulich 62 weeks ago
There's no need to re-obtain a page reference if only bits not affecting
the address change.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2320,7 +2320,7 @@ int hvm_set_cr3(unsigned long value, boo
     }
 
     if ( hvm_paging_enabled(v) && !paging_mode_hap(currd) &&
-         (value != v->arch.hvm.guest_cr[3]) )
+         ((value ^ v->arch.hvm.guest_cr[3]) >> PAGE_SHIFT) )
     {
         /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH 7/9] x86/HVM: cosmetics to hvm_set_cr3()

Posted by Jan Beulich 62 weeks ago
Eliminate the not really useful local variable "old". Reduce the scope
of "page". Rename the latched "current".

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -2290,10 +2290,8 @@ int hvm_set_cr0(unsigned long value, boo
 
 int hvm_set_cr3(unsigned long value, bool noflush, bool may_defer)
 {
-    struct vcpu *v = current;
-    struct domain *currd = v->domain;
-    struct page_info *page;
-    unsigned long old = v->arch.hvm.guest_cr[3];
+    struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
 
     if ( value & ~((1UL << currd->arch.cpuid->extd.maxphysaddr) - 1) )
     {
@@ -2306,36 +2304,38 @@ int hvm_set_cr3(unsigned long value, boo
     if ( may_defer && unlikely(currd->arch.monitor.write_ctrlreg_enabled &
                                monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) )
     {
-        ASSERT(v->arch.vm_event);
+        ASSERT(curr->arch.vm_event);
 
-        if ( hvm_monitor_crX(CR3, value, old) )
+        if ( hvm_monitor_crX(CR3, value, curr->arch.hvm.guest_cr[3]) )
         {
             /* The actual write will occur in hvm_do_resume(), if permitted. */
-            v->arch.vm_event->write_data.do_write.cr3 = 1;
-            v->arch.vm_event->write_data.cr3 = value;
-            v->arch.vm_event->write_data.cr3_noflush = noflush;
+            curr->arch.vm_event->write_data.do_write.cr3 = 1;
+            curr->arch.vm_event->write_data.cr3 = value;
+            curr->arch.vm_event->write_data.cr3_noflush = noflush;
 
             return X86EMUL_OKAY;
         }
     }
 
-    if ( hvm_paging_enabled(v) && !paging_mode_hap(currd) &&
-         ((value ^ v->arch.hvm.guest_cr[3]) >> PAGE_SHIFT) )
+    if ( hvm_paging_enabled(curr) && !paging_mode_hap(currd) &&
+         ((value ^ curr->arch.hvm.guest_cr[3]) >> PAGE_SHIFT) )
     {
         /* Shadow-mode CR3 change. Check PDBR and update refcounts. */
+        struct page_info *page;
+
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value);
         page = get_page_from_gfn(currd, value >> PAGE_SHIFT, NULL, P2M_ALLOC);
         if ( !page )
             goto bad_cr3;
 
-        put_page(pagetable_get_page(v->arch.guest_table));
-        v->arch.guest_table = pagetable_from_page(page);
+        put_page(pagetable_get_page(curr->arch.guest_table));
+        curr->arch.guest_table = pagetable_from_page(page);
 
         HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value);
     }
 
-    v->arch.hvm.guest_cr[3] = value;
-    paging_update_cr3(v, noflush);
+    curr->arch.hvm.guest_cr[3] = value;
+    paging_update_cr3(curr, noflush);
     return X86EMUL_OKAY;
 
  bad_cr3:





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH 8/9] x86/CPUID: drop INVPCID dependency on PCID

Posted by Jan Beulich 62 weeks ago
PCID validly depends on LM, as it can be enabled in Long Mode only.
INVPCID, otoh, can be used not only without PCID enabled, but also
outside of Long Mode altogether. In both cases its functionality is
simply restricted to PCID 0, which is sort of expected as no other PCID
can be activated there.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -217,10 +217,6 @@ def crunch_numbers(state):
         #
         # SSE4_2: [POPCNT]
 
-        # The INVPCID instruction depends on PCID infrastructure being
-        # available.
-        PCID: [INVPCID],
-
         # XSAVE is an extra set of instructions for state management, but
         # doesn't constitue new state itself.  Some of the dependent features
         # are instructions built on top of base XSAVE, while others are new



_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel

[Xen-devel] [PATCH 9/9] x86: PCID is unused when !PV

Posted by Jan Beulich 62 weeks ago
This allows in particular some streamlining of the TLB flushing code
paths.

Signed-off-by: Jan Beulich <jbeulich@suse.com>

--- a/xen/arch/x86/flushtlb.c
+++ b/xen/arch/x86/flushtlb.c
@@ -24,6 +24,11 @@
 #define WRAP_MASK (0x000003FFU)
 #endif
 
+#ifndef CONFIG_PV
+# undef X86_CR4_PCIDE
+# define X86_CR4_PCIDE 0
+#endif
+
 u32 tlbflush_clock = 1U;
 DEFINE_PER_CPU(u32, tlbflush_time);
 
--- a/xen/include/asm-x86/processor.h
+++ b/xen/include/asm-x86/processor.h
@@ -294,7 +294,11 @@ static inline unsigned long cr3_pa(unsig
 
 static inline unsigned int cr3_pcid(unsigned long cr3)
 {
+#ifdef CONFIG_PV
     return cr3 & X86_CR3_PCID_MASK;
+#else
+    return 0;
+#endif
 }
 
 static inline unsigned long read_cr4(void)
@@ -306,8 +310,12 @@ static inline void write_cr4(unsigned lo
 {
     struct cpu_info *info = get_cpu_info();
 
+#ifdef CONFIG_PV
     /* No global pages in case of PCIDs enabled! */
     ASSERT(!(val & X86_CR4_PGE) || !(val & X86_CR4_PCIDE));
+#else
+    ASSERT(!(val & X86_CR4_PCIDE));
+#endif
 
     /*
      * On hardware supporting FSGSBASE, the value in %cr4 is the kernel's
--- a/xen/include/asm-x86/pv/domain.h
+++ b/xen/include/asm-x86/pv/domain.h
@@ -50,8 +50,13 @@
  */
 static inline unsigned long get_pcid_bits(const struct vcpu *v, bool is_xpti)
 {
+#ifdef CONFIG_PV
     return X86_CR3_NOFLUSH | (is_xpti ? PCID_PV_XPTI : 0) |
            ((v->arch.flags & TF_kernel_mode) ? PCID_PV_PRIV : PCID_PV_USER);
+#else
+    ASSERT_UNREACHABLE();
+    return 0;
+#endif
 }
 
 #ifdef CONFIG_PV





_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel