[Xen-devel] [PATCH v4 0/7] x86/HVM: implement memory read caching

Posted by Jan Beulich 4 years, 2 months ago
Emulation requiring device model assistance uses a form of instruction
re-execution, assuming that the second (and any further) pass takes
exactly the same path. This is a valid assumption as far as use of CPU
registers goes (as those can't change without any other instruction
executing in between), but is wrong for memory accesses. In particular
it has been observed that Windows might page out buffers underneath
an instruction currently under emulation (hitting between two passes).
If the first pass translated a linear address successfully, any subsequent
pass needs to do so too, yielding the exact same translation.

Introduce a cache to make sure the above-described assumption holds. This
is a very simplistic implementation for now: Only exact matches are
satisfied (no overlaps or partial reads or anything).
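
To illustrate what "exact match" means, in terms of the interface
patch 4 adds (sketch only - v, gpa, val and out stand for whatever the
caller has at hand):

    hvmemul_write_cache(v, gpa, &val, 8);    /* record an 8-byte read */
    hvmemul_read_cache(v, gpa, &out, 8);     /* hit: same (gpa, size) */
    hvmemul_read_cache(v, gpa + 4, &out, 4); /* miss: overlapping, but
                                                not an exact match */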

There's also some seemingly unrelated cleanup here which was found
desirable along the way - the 3 initial patches are true prereqs
(at least in a contextual way), while the last 2 are just for things
noticed in the process.

1: SVM: drop asm/hvm/emulate.h inclusion from vmcb.h
2: x86/HVM: rename a variable in __hvm_copy()
3: x86/HVM: introduce "curr" into hvmemul_rep_{mov,sto}s()
4: x86/HVM: implement memory read caching for insn emulation
5: x86/mm: use cache in guest_walk_tables()
6: x86/mm: drop p2mt parameter from map_domain_gfn()
7: x86/HVM: reduce scope of pfec in hvm_emulate_init_per_insn()

Compared to v3 this is a major re-work to avoid passing around
"cache" arguments, which is my understanding of the main piece of
feedback on v3. I've also dropped (at least for the time being) the
add-on patches to seed the cache with PAE PDPTE values.

Jan

[Xen-devel] [PATCH v4 1/7] SVM: drop asm/hvm/emulate.h inclusion from vmcb.h
Posted by Jan Beulich 4 years, 2 months ago
It's not needed there and introduces a needless, almost global
dependency. Include the file (or in some cases just xen/err.h) where
actually needed, or - in one case - simply forward-declare a struct. In
microcode*.c take the opportunity and also re-order a few other
#include-s.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -55,6 +55,7 @@
 #include <asm/mc146818rtc.h>
 #include <asm/mce.h>
 #include <asm/monitor.h>
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/vpt.h>
 #include <asm/hvm/support.h>
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -28,6 +28,7 @@
 #include <xen/paging.h>
 #include <xen/vpci.h>
 
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/ioreq.h>
 #include <asm/hvm/vmx/vmx.h>
--- a/xen/arch/x86/hvm/svm/emulate.c
+++ b/xen/arch/x86/hvm/svm/emulate.c
@@ -20,6 +20,7 @@
 #include <xen/lib.h>
 #include <xen/trace.h>
 #include <asm/msr.h>
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/hvm/svm/svm.h>
--- a/xen/arch/x86/hvm/vm_event.c
+++ b/xen/arch/x86/hvm/vm_event.c
@@ -22,6 +22,7 @@
 
 #include <xen/sched.h>
 #include <xen/vm_event.h>
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/support.h>
 #include <asm/vm_event.h>
 
--- a/xen/arch/x86/microcode.c
+++ b/xen/arch/x86/microcode.c
@@ -22,9 +22,10 @@
  */
 
 #include <xen/cpu.h>
-#include <xen/lib.h>
-#include <xen/kernel.h>
+#include <xen/err.h>
 #include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/lib.h>
 #include <xen/notifier.h>
 #include <xen/sched.h>
 #include <xen/smp.h>
--- a/xen/arch/x86/microcode_amd.c
+++ b/xen/arch/x86/microcode_amd.c
@@ -14,9 +14,10 @@
  *  License version 2. See file COPYING for details.
  */
 
-#include <xen/lib.h>
-#include <xen/kernel.h>
+#include <xen/err.h>
 #include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/lib.h>
 #include <xen/sched.h>
 #include <xen/smp.h>
 #include <xen/spinlock.h>
--- a/xen/arch/x86/microcode_intel.c
+++ b/xen/arch/x86/microcode_intel.c
@@ -21,9 +21,10 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#include <xen/lib.h>
-#include <xen/kernel.h>
+#include <xen/err.h>
 #include <xen/init.h>
+#include <xen/kernel.h>
+#include <xen/lib.h>
 #include <xen/sched.h>
 #include <xen/smp.h>
 #include <xen/spinlock.h>
--- a/xen/arch/x86/mm/shadow/hvm.c
+++ b/xen/arch/x86/mm/shadow/hvm.c
@@ -28,6 +28,7 @@
 #include <xen/trace.h>
 
 #include <asm/current.h>
+#include <asm/hvm/emulate.h>
 #include <asm/shadow.h>
 
 #include "private.h"
--- a/xen/arch/x86/pv/emul-gate-op.c
+++ b/xen/arch/x86/pv/emul-gate-op.c
@@ -19,6 +19,7 @@
  * along with this program; If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <xen/err.h>
 #include <xen/errno.h>
 #include <xen/event.h>
 #include <xen/guest_access.h>
--- a/xen/include/asm-x86/hvm/svm/vmcb.h
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h
@@ -20,8 +20,6 @@
 #define __ASM_X86_HVM_SVM_VMCB_H__
 
 #include <xen/types.h>
-#include <asm/hvm/emulate.h>
-
 
 /* general 1 intercepts */
 enum GenericIntercept1bits
--- a/xen/include/asm-x86/hvm/vmx/vmx.h
+++ b/xen/include/asm-x86/hvm/vmx/vmx.h
@@ -97,6 +97,7 @@ void vmx_asm_do_vmentry(void);
 void vmx_intr_assist(void);
 void noreturn vmx_do_resume(struct vcpu *);
 void vmx_vlapic_msr_changed(struct vcpu *v);
+struct hvm_emulate_ctxt;
 void vmx_realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt);
 void vmx_realmode(struct cpu_user_regs *regs);
 void vmx_update_debug_state(struct vcpu *v);


Re: [Xen-devel] [PATCH v4 1/7] SVM: drop asm/hvm/emulate.h inclusion from vmcb.h
Posted by Andrew Cooper 4 years, 2 months ago
On 31/01/2020 16:42, Jan Beulich wrote:
> It's not needed there and introduces a needless, almost global
> dependency. Include the file (or in some cases just xen/err.h) where
> actually needed, or - in one case - simply forward-declare a struct. In
> microcode*.c take the opportunity and also re-order a few other
> #include-s.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

Re: [Xen-devel] [PATCH v4 1/7] SVM: drop asm/hvm/emulate.h inclusion from vmcb.h
Posted by Alexandru Stefan ISAILA 4 years, 2 months ago

On 31.01.2020 18:42, Jan Beulich wrote:
> It's not needed there and introduces a needless, almost global
> dependency. Include the file (or in some cases just xen/err.h) where
> actually needed, or - in one case - simply forward-declare a struct. In
> microcode*.c take the opportunity and also re-order a few other
> #include-s.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Alexandru Isaila <aisaila@bitdefender.com>
Re: [Xen-devel] [PATCH v4 1/7] SVM: drop asm/hvm/emulate.h inclusion from vmcb.h
Posted by Tian, Kevin 4 years, 2 months ago
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Saturday, February 1, 2020 12:42 AM
> 
> It's not needed there and introduces a needless, almost global
> dependency. Include the file (or in some cases just xen/err.h) where
> actually needed, or - in one case - simply forward-declare a struct. In
> microcode*.c take the opportunity and also re-order a few other
> #include-s.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Kevin Tian <kevin.tian@intel.com>
[Xen-devel] [PATCH v4 2/7] x86/HVM: rename a variable in __hvm_copy()
Posted by Jan Beulich 4 years, 2 months ago
This is to reflect its actual purpose. Also use it in a 2nd place.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3255,9 +3255,9 @@ static enum hvm_translation_result __hvm
     while ( todo > 0 )
     {
         enum hvm_translation_result res;
-        paddr_t gpa = addr & ~PAGE_MASK;
+        unsigned int pgoff = addr & ~PAGE_MASK;
 
-        count = min_t(int, PAGE_SIZE - gpa, todo);
+        count = min_t(int, PAGE_SIZE - pgoff, todo);
 
         res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear,
                                      pfec, pfinfo, &page, &gfn, &p2mt);
@@ -3279,7 +3279,7 @@ static enum hvm_translation_result __hvm
             return HVMTRANS_need_retry;
         }
 
-        p = (char *)__map_domain_page(page) + (addr & ~PAGE_MASK);
+        p = __map_domain_page(page) + pgoff;
 
         if ( flags & HVMCOPY_to_guest )
         {


Re: [Xen-devel] [PATCH v4 2/7] x86/HVM: rename a variable in __hvm_copy()
Posted by Andrew Cooper 4 years, 2 months ago
On 31/01/2020 16:42, Jan Beulich wrote:
> This is to reflect its actual purpose. Also use in a 2nd place.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

[Xen-devel] [PATCH v4 3/7] x86/HVM: introduce "curr" into hvmemul_rep_{mov, sto}s()
Posted by Jan Beulich 4 years, 2 months ago
There are a number of uses of "current" already, and more may appear
down the road. Latch it into a local variable.

At this occasion also drop stray casts from code getting touched anyway.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1747,7 +1747,8 @@ static int hvmemul_rep_movs(
 {
     struct hvm_emulate_ctxt *hvmemul_ctxt =
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
-    struct hvm_vcpu_io *vio = &current->arch.hvm.hvm_io;
+    struct vcpu *curr = current;
+    struct hvm_vcpu_io *vio = &curr->arch.hvm.hvm_io;
     unsigned long saddr, daddr, bytes;
     paddr_t sgpa, dgpa;
     uint32_t pfec = PFEC_page_present;
@@ -1807,8 +1808,8 @@ static int hvmemul_rep_movs(
     }
 
     /* Check for MMIO ops */
-    (void) get_gfn_query_unlocked(current->domain, sgpa >> PAGE_SHIFT, &sp2mt);
-    (void) get_gfn_query_unlocked(current->domain, dgpa >> PAGE_SHIFT, &dp2mt);
+    get_gfn_query_unlocked(curr->domain, sgpa >> PAGE_SHIFT, &sp2mt);
+    get_gfn_query_unlocked(curr->domain, dgpa >> PAGE_SHIFT, &dp2mt);
 
     if ( sp2mt == p2m_mmio_direct || dp2mt == p2m_mmio_direct ||
          (sp2mt == p2m_mmio_dm && dp2mt == p2m_mmio_dm) )
@@ -1873,7 +1874,7 @@ static int hvmemul_rep_movs(
         rc = hvm_copy_from_guest_phys(buf, sgpa, bytes);
 
     if ( rc == HVMTRANS_okay )
-        rc = hvm_copy_to_guest_phys(dgpa, buf, bytes, current);
+        rc = hvm_copy_to_guest_phys(dgpa, buf, bytes, curr);
 
     xfree(buf);
 
@@ -1910,7 +1911,8 @@ static int hvmemul_rep_stos(
 {
     struct hvm_emulate_ctxt *hvmemul_ctxt =
         container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
-    struct hvm_vcpu_io *vio = &current->arch.hvm.hvm_io;
+    struct vcpu *curr = current;
+    struct hvm_vcpu_io *vio = &curr->arch.hvm.hvm_io;
     unsigned long addr, bytes;
     paddr_t gpa;
     p2m_type_t p2mt;
@@ -1943,7 +1945,7 @@ static int hvmemul_rep_stos(
     }
 
     /* Check for MMIO op */
-    (void)get_gfn_query_unlocked(current->domain, gpa >> PAGE_SHIFT, &p2mt);
+    get_gfn_query_unlocked(curr->domain, gpa >> PAGE_SHIFT, &p2mt);
 
     switch ( p2mt )
     {
@@ -1992,7 +1994,7 @@ static int hvmemul_rep_stos(
         if ( df )
             gpa -= bytes - bytes_per_rep;
 
-        rc = hvm_copy_to_guest_phys(gpa, buf, bytes, current);
+        rc = hvm_copy_to_guest_phys(gpa, buf, bytes, curr);
 
         if ( buf != p_data )
             xfree(buf);


Re: [Xen-devel] [PATCH v4 3/7] x86/HVM: introduce "curr" into hvmemul_rep_{mov, sto}s()
Posted by Andrew Cooper 4 years, 2 months ago
On 31/01/2020 16:43, Jan Beulich wrote:
> There are a number of uses of "current" already, and more may appear
> down the road. Latch into a local variable.
>
> At this occasion also drop stray casts from code getting touched anyway.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Andrew Cooper <andrew.cooper3@citrix.com>

[Xen-devel] [PATCH v4 4/7] x86/HVM: implement memory read caching for insn emulation
Posted by Jan Beulich 4 years, 2 months ago
Emulation requiring device model assistance uses a form of instruction
re-execution, assuming that the second (and any further) pass takes
exactly the same path. This is a valid assumption as far as use of CPU
registers goes (as those can't change without any other instruction
executing in between), but is wrong for memory accesses. In particular
it has been observed that Windows might page out buffers underneath an
instruction currently under emulation (hitting between two passes). If
the first pass read a memory operand successfully, any subsequent pass
needs to get to see the exact same value.

Introduce a cache to make sure the above-described assumption holds. This
is a very simplistic implementation for now: Only exact matches are
satisfied (no overlaps or partial reads or anything); this is sufficient
for the immediate purpose of making re-execution an exact replay. The
cache also won't be used just yet for guest page walks; that'll be the
subject of a subsequent change.

With the cache being generally transparent to upper layers, but with it
having limited capacity yet being required for correctness, certain
users of hvm_copy_from_guest_*() need to disable caching temporarily,
without invalidating the cache. Note that the adjustments here to
hvm_hypercall() and hvm_task_switch() are benign at this point; they'll
become relevant once we start to be able to emulate respective insns
through the main emulator (and more changes will then likely be needed
to nested code).

As to the actual data page in this scenario, there are a couple of
aspects to take into consideration:
- We must be talking about an insn accessing two locations (two memory
  ones, one of which is MMIO, or a memory and an I/O one).
- If the non I/O / MMIO side is being read, the re-read (if it occurs at
  all) is having its result discarded, by taking the shortcut through
  the first switch()'s STATE_IORESP_READY case in hvmemul_do_io(). Note
  how, among all the re-issue sanity checks there, we avoid comparing
  the actual data.
- If the non I/O / MMIO side is being written, it is the OS's
  responsibility to avoid actually moving page contents to disk while
  there might still be a write access in flight - this is no different
  in behavior from bare hardware.
- Read-modify-write accesses are, as always, complicated, and while we
  deal with them better nowadays than we did in the past, we're still
  not quite there to guarantee hardware-like behavior in all cases
  anyway. Nothing is getting worse by the changes made here, afaict.

In __hvm_copy() also reduce p's scope and change its type to void *.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: In principle the caching here renders unnecessary the one used for
     insn bytes (vio->mmio_insn{,_bytes}). However, to seed the cache
     with the data SVM may have made available, we'd have to also know
     the corresponding GPA. It's not safe, however, to re-walk the page
     tables to find out, as the page tables may have changed in the
     meantime. Therefore I guess we need to keep the duplicate
     functionality for now. A possible solution to this could be to use
     a physical-address-based cache for page table accesses (and looking
     forward also e.g. SVM/VMX insn emulation), and a linear-address-
     based one for all other reads.
---
v4: Re-write for cache to become transparent to callers.
v3: Add text about the actual data page to the description.
v2: Re-base.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -28,6 +28,17 @@
 #include <asm/iocap.h>
 #include <asm/vm_event.h>
 
+struct hvmemul_cache
+{
+    unsigned int num_ents;
+    unsigned int max_ents;
+    struct {
+        paddr_t gpa:PADDR_BITS;
+        unsigned int size:BITS_PER_LONG - PADDR_BITS;
+        unsigned long data;
+    } ents[];
+};
+
 static void hvmtrace_io_assist(const ioreq_t *p)
 {
     unsigned int size, event;
@@ -1866,12 +1877,17 @@ static int hvmemul_rep_movs(
         rc = HVMTRANS_okay;
     }
     else
+    {
+        unsigned int token = hvmemul_cache_disable(curr);
+
         /*
          * We do a modicum of checking here, just for paranoia's sake and to
          * definitely avoid copying an unitialised buffer into guest address
          * space.
          */
         rc = hvm_copy_from_guest_phys(buf, sgpa, bytes);
+        hvmemul_cache_restore(curr, token);
+    }
 
     if ( rc == HVMTRANS_okay )
         rc = hvm_copy_to_guest_phys(dgpa, buf, bytes, curr);
@@ -2534,6 +2550,19 @@ static int _hvm_emulate_one(struct hvm_e
     struct hvm_vcpu_io *vio = &curr->arch.hvm.hvm_io;
     int rc;
 
+    /*
+     * Enable caching if it's currently disabled, but leave the cache
+     * untouched if it's already enabled, for re-execution to consume
+     * entries populated by an earlier pass.
+     */
+    if ( vio->cache->num_ents > vio->cache->max_ents )
+    {
+        ASSERT(vio->io_req.state == STATE_IOREQ_NONE);
+        vio->cache->num_ents = 0;
+    }
+    else
+        ASSERT(vio->io_req.state == STATE_IORESP_READY);
+
     hvm_emulate_init_per_insn(hvmemul_ctxt, vio->mmio_insn,
                               vio->mmio_insn_bytes);
 
@@ -2547,6 +2576,7 @@ static int _hvm_emulate_one(struct hvm_e
     {
         vio->mmio_cache_count = 0;
         vio->mmio_insn_bytes = 0;
+        hvmemul_cache_disable(curr);
     }
     else
     {
@@ -2838,6 +2868,123 @@ void hvm_dump_emulation_state(const char
            hvmemul_ctxt->insn_buf);
 }
 
+int hvmemul_cache_init(struct vcpu *v)
+{
+    /*
+     * No insn can access more than 16 independent linear addresses (AVX512F
+     * scatters/gathers being the worst). Each such linear range can span a
+     * page boundary, i.e. may require two page walks. Account for each insn
+     * byte individually.
+     */
+    const unsigned int nents = (CONFIG_PAGING_LEVELS + 1) *
+                               (MAX_INST_LEN + 16 * 2);
+    struct hvmemul_cache *cache = xmalloc_flex_struct(struct hvmemul_cache,
+                                                      ents, nents);
+
+    if ( !cache )
+        return -ENOMEM;
+
+    /* Cache is disabled initially. */
+    cache->num_ents = nents + 1;
+    cache->max_ents = nents;
+
+    v->arch.hvm.hvm_io.cache = cache;
+
+    return 0;
+}
+
+unsigned int hvmemul_cache_disable(struct vcpu *v)
+{
+    struct hvmemul_cache *cache = v->arch.hvm.hvm_io.cache;
+    unsigned int token = cache->num_ents;
+
+    cache->num_ents = cache->max_ents + 1;
+
+    return token;
+}
+
+void hvmemul_cache_restore(struct vcpu *v, unsigned int token)
+{
+    struct hvmemul_cache *cache = v->arch.hvm.hvm_io.cache;
+
+    ASSERT(cache->num_ents > cache->max_ents);
+    cache->num_ents = token;
+}
+
+bool hvmemul_read_cache(const struct vcpu *v, paddr_t gpa,
+                        void *buffer, unsigned int size)
+{
+    const struct hvmemul_cache *cache = v->arch.hvm.hvm_io.cache;
+    unsigned int i;
+
+    /* Cache unavailable? */
+    if ( cache->num_ents > cache->max_ents )
+        return false;
+
+    while ( size > sizeof(cache->ents->data) )
+    {
+        i = gpa & (sizeof(cache->ents->data) - 1)
+            ? -gpa & (sizeof(cache->ents->data) - 1)
+            : sizeof(cache->ents->data);
+        if ( !hvmemul_read_cache(v, gpa, buffer, i) )
+            return false;
+        gpa += i;
+        buffer += i;
+        size -= i;
+    }
+
+    for ( i = 0; i < cache->num_ents; ++i )
+        if ( cache->ents[i].gpa == gpa && cache->ents[i].size == size )
+        {
+            memcpy(buffer, &cache->ents[i].data, size);
+            return true;
+        }
+
+    return false;
+}
+
+void hvmemul_write_cache(const struct vcpu *v, paddr_t gpa,
+                         const void *buffer, unsigned int size)
+{
+    struct hvmemul_cache *cache = v->arch.hvm.hvm_io.cache;
+    unsigned int i;
+
+    /* Cache unavailable? */
+    if ( cache->num_ents > cache->max_ents )
+        return;
+
+    while ( size > sizeof(cache->ents->data) )
+    {
+        i = gpa & (sizeof(cache->ents->data) - 1)
+            ? -gpa & (sizeof(cache->ents->data) - 1)
+            : sizeof(cache->ents->data);
+        hvmemul_write_cache(v, gpa, buffer, i);
+        gpa += i;
+        buffer += i;
+        size -= i;
+    }
+
+    for ( i = 0; i < cache->num_ents; ++i )
+        if ( cache->ents[i].gpa == gpa && cache->ents[i].size == size )
+        {
+            memcpy(&cache->ents[i].data, buffer, size);
+            return;
+        }
+
+    if ( unlikely(i >= cache->max_ents) )
+    {
+        ASSERT_UNREACHABLE();
+        return;
+    }
+
+    cache->ents[i].gpa  = gpa;
+    cache->ents[i].size = size;
+
+    memcpy(&cache->ents[i].data, buffer, size);
+
+    cache->num_ents = i + 1;
+}
+
 /*
  * Local variables:
  * mode: C
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -717,6 +717,8 @@ int hvm_domain_initialise(struct domain
 /* This function and all its descendants need to be to be idempotent. */
 void hvm_domain_relinquish_resources(struct domain *d)
 {
+    struct vcpu *v;
+
     if ( hvm_funcs.domain_relinquish_resources )
         alternative_vcall(hvm_funcs.domain_relinquish_resources, d);
 
@@ -733,6 +735,9 @@ void hvm_domain_relinquish_resources(str
     rtc_deinit(d);
     pmtimer_deinit(d);
     hpet_deinit(d);
+
+    for_each_vcpu ( d, v )
+        hvmemul_cache_destroy(v);
 }
 
 void hvm_domain_destroy(struct domain *d)
@@ -1538,6 +1543,10 @@ int hvm_vcpu_initialise(struct vcpu *v)
 
     v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
 
+    rc = hvmemul_cache_init(v);
+    if ( rc )
+        goto fail4;
+
     rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */
     if ( rc != 0 )
         goto fail4;
@@ -1573,6 +1582,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
  fail5:
     free_compat_arg_xlat(v);
  fail4:
+    hvmemul_cache_destroy(v);
     hvm_funcs.vcpu_destroy(v);
  fail3:
     vlapic_destroy(v);
@@ -2934,6 +2944,7 @@ void hvm_task_switch(
     unsigned int eflags, new_cpl;
     pagefault_info_t pfinfo;
     int exn_raised, rc;
+    unsigned int token = hvmemul_cache_disable(v);
     struct tss32 tss;
 
     hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
@@ -3141,6 +3152,8 @@ void hvm_task_switch(
  out:
     hvm_unmap_entry(optss_desc);
     hvm_unmap_entry(nptss_desc);
+
+    hvmemul_cache_restore(v, token);
 }
 
 enum hvm_translation_result hvm_translate_get_page(
@@ -3231,7 +3244,6 @@ static enum hvm_translation_result __hvm
     gfn_t gfn;
     struct page_info *page;
     p2m_type_t p2mt;
-    char *p;
     int count, todo = size;
 
     ASSERT(is_hvm_vcpu(v));
@@ -3279,11 +3291,17 @@ static enum hvm_translation_result __hvm
             return HVMTRANS_need_retry;
         }
 
-        p = __map_domain_page(page) + pgoff;
-
-        if ( flags & HVMCOPY_to_guest )
+        if ( (flags & HVMCOPY_to_guest) ||
+             !hvmemul_read_cache(v, gfn_to_gaddr(gfn) | pgoff, buf, count) )
         {
-            if ( p2m_is_discard_write(p2mt) )
+            void *p = __map_domain_page(page) + pgoff;
+
+            if ( !(flags & HVMCOPY_to_guest) )
+            {
+                memcpy(buf, p, count);
+                hvmemul_write_cache(v, gfn_to_gaddr(gfn) | pgoff, buf, count);
+            }
+            else if ( p2m_is_discard_write(p2mt) )
             {
                 static unsigned long lastpage;
 
@@ -3300,13 +3318,9 @@ static enum hvm_translation_result __hvm
                     memset(p, 0, count);
                 paging_mark_pfn_dirty(v->domain, _pfn(gfn_x(gfn)));
             }
-        }
-        else
-        {
-            memcpy(buf, p, count);
-        }
 
-        unmap_domain_page(p);
+            unmap_domain_page(p);
+        }
 
         addr += count;
         if ( buf )
--- a/xen/arch/x86/hvm/hypercall.c
+++ b/xen/arch/x86/hvm/hypercall.c
@@ -22,6 +22,7 @@
 #include <xen/hypercall.h>
 #include <xen/nospec.h>
 
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/support.h>
 
 static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
@@ -159,6 +160,7 @@ int hvm_hypercall(struct cpu_user_regs *
     struct domain *currd = curr->domain;
     int mode = hvm_guest_x86_mode(curr);
     unsigned long eax = regs->eax;
+    unsigned int token;
 
     switch ( mode )
     {
@@ -183,7 +185,18 @@ int hvm_hypercall(struct cpu_user_regs *
     }
 
     if ( (eax & 0x80000000) && is_viridian_domain(currd) )
-        return viridian_hypercall(regs);
+    {
+        int ret;
+
+        /* See comment below. */
+        token = hvmemul_cache_disable(curr);
+
+        ret = viridian_hypercall(regs);
+
+        hvmemul_cache_restore(curr, token);
+
+        return ret;
+    }
 
     BUILD_BUG_ON(ARRAY_SIZE(hvm_hypercall_table) >
                  ARRAY_SIZE(hypercall_args_table));
@@ -202,6 +215,12 @@ int hvm_hypercall(struct cpu_user_regs *
         return HVM_HCALL_completed;
     }
 
+    /*
+     * Caching is intended for instruction emulation only. Disable it
+     * for any accesses by hypercall argument copy-in / copy-out.
+     */
+    token = hvmemul_cache_disable(curr);
+
     curr->hcall_preempted = false;
 
     if ( mode == 8 )
@@ -295,6 +314,8 @@ int hvm_hypercall(struct cpu_user_regs *
 #endif
     }
 
+    hvmemul_cache_restore(curr, token);
+
     HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%lu -> %lx", eax, regs->rax);
 
     if ( curr->hcall_preempted )
--- a/xen/arch/x86/hvm/intercept.c
+++ b/xen/arch/x86/hvm/intercept.c
@@ -20,6 +20,7 @@
 #include <xen/types.h>
 #include <xen/sched.h>
 #include <asm/regs.h>
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/hvm/domain.h>
@@ -163,6 +164,9 @@ int hvm_process_io_intercept(const struc
         {
             if ( p->data_is_ptr )
             {
+                struct vcpu *curr = current;
+                unsigned int token = hvmemul_cache_disable(curr);
+
                 data = 0;
                 switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
                                                   p->size) )
@@ -179,9 +183,11 @@ int hvm_process_io_intercept(const struc
                     ASSERT_UNREACHABLE();
                     /* fall through */
                 default:
-                    domain_crash(current->domain);
+                    domain_crash(curr->domain);
                     return X86EMUL_UNHANDLEABLE;
                 }
+
+                hvmemul_cache_restore(curr, token);
             }
             else
                 data = p->data;
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1040,6 +1040,8 @@ void svm_vmenter_helper(const struct cpu
     struct vcpu *curr = current;
     struct vmcb_struct *vmcb = curr->arch.hvm.svm.vmcb;
 
+    ASSERT(hvmemul_cache_disabled(curr));
+
     svm_asid_handle_vmrun();
 
     if ( unlikely(tb_init_done) )
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -35,6 +35,7 @@
 #include <xen/irq.h>
 #include <xen/vpci.h>
 #include <public/hvm/ioreq.h>
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/io.h>
 #include <asm/hvm/vpic.h>
 #include <asm/hvm/vlapic.h>
@@ -607,6 +608,7 @@ void msix_write_completion(struct vcpu *
     if ( !ctrl_address && snoop_addr &&
          v->arch.hvm.hvm_io.msix_snoop_gpa )
     {
+        unsigned int token = hvmemul_cache_disable(v);
         const struct msi_desc *desc;
         uint32_t data;
 
@@ -621,6 +623,8 @@ void msix_write_completion(struct vcpu *
                                       sizeof(data)) == HVMTRANS_okay &&
              !(data & PCI_MSIX_VECTOR_BITMASK) )
             ctrl_address = snoop_addr;
+
+        hvmemul_cache_restore(v, token);
     }
 
     if ( !ctrl_address )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4347,6 +4347,8 @@ bool vmx_vmenter_helper(const struct cpu
     struct hvm_vcpu_asid *p_asid;
     bool_t need_flush;
 
+    ASSERT(hvmemul_cache_disabled(curr));
+
     /* Shadow EPTP can't be updated here because irqs are disabled */
      if ( nestedhvm_vcpu_in_guestmode(curr) && vcpu_nestedhvm(curr).stale_np2m )
          return false;
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -13,6 +13,7 @@
 #define __ASM_X86_HVM_EMULATE_H__
 
 #include <xen/err.h>
+#include <xen/sched.h>
 #include <asm/hvm/hvm.h>
 #include <asm/x86_emulate.h>
 
@@ -96,6 +97,31 @@ int hvmemul_do_pio_buffer(uint16_t port,
                           uint8_t dir,
                           void *buffer);
 
+#ifdef CONFIG_HVM
+int __must_check hvmemul_cache_init(struct vcpu *v);
+static inline void hvmemul_cache_destroy(struct vcpu *v)
+{
+    XFREE(v->arch.hvm.hvm_io.cache);
+}
+bool hvmemul_read_cache(const struct vcpu *, paddr_t gpa,
+                        void *buffer, unsigned int size);
+void hvmemul_write_cache(const struct vcpu *, paddr_t gpa,
+                         const void *buffer, unsigned int size);
+unsigned int hvmemul_cache_disable(struct vcpu *);
+void hvmemul_cache_restore(struct vcpu *, unsigned int token);
+/* For use in ASSERT()s only: */
+static inline bool hvmemul_cache_disabled(struct vcpu *v)
+{
+    return hvmemul_cache_disable(v) == hvmemul_cache_disable(v);
+}
+#else
+static inline bool hvmemul_read_cache(const struct vcpu *v, paddr_t gpa,
+                                      void *buf,
+                                      unsigned int size) { return false; }
+static inline void hvmemul_write_cache(const struct vcpu *v, paddr_t gpa,
+                                       const void *buf, unsigned int size) {}
+#endif
+
 void hvm_dump_emulation_state(const char *loglvl, const char *prefix,
                               struct hvm_emulate_ctxt *hvmemul_ctxt, int rc);
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -77,6 +77,8 @@ struct hvm_vcpu_io {
     /* For retries we shouldn't re-fetch the instruction. */
     unsigned int mmio_insn_bytes;
     unsigned char mmio_insn[16];
+    struct hvmemul_cache *cache;
+
     /*
      * For string instruction emulation we need to be able to signal a
      * necessary retry through other than function return codes.

Re: [Xen-devel] [PATCH v4 4/7] x86/HVM: implement memory read caching for insn emulation
Posted by Andrew Cooper 4 years, 2 months ago
On 31/01/2020 16:44, Jan Beulich wrote:
> Emulation requiring device model assistance uses a form of instruction
> re-execution, assuming that the second (and any further) pass takes
> exactly the same path. This is a valid assumption as far as use of CPU
> registers goes (as those can't change without any other instruction
> executing in between), but is wrong for memory accesses.

This statement isn't quite accurate.  Various hypercalls and IPIs can
play with GPR state in the middle of emulation.

What matters is that the GPR values aren't expected to change, and a
guest gets to keep all the pieces if they try to play games in this area.

>  In particular
> it has been observed that Windows might page out buffers underneath an
> instruction currently under emulation (hitting between two passes). If
> the first pass read a memory operand successfully, any subsequent pass
> needs to get to see the exact same value.
>
> Introduce a cache to make sure above described assumption holds. This
> is a very simplistic implementation for now: Only exact matches are
> satisfied (no overlaps or partial reads or anything); this is sufficient
> for the immediate purpose of making re-execution an exact replay. The
> cache also won't be used just yet for guest page walks; that'll be the
> subject of a subsequent change.
>
> With the cache being generally transparent to upper layers, but with it
> having limited capacity yet being required for correctness, certain
> users of hvm_copy_from_guest_*() need to disable caching temporarily,
> without invalidating the cache. Note that the adjustments here to
> hvm_hypercall() and hvm_task_switch() are benign at this point; they'll
> become relevant once we start to be able to emulate respective insns
> through the main emulator (and more changes will then likely be needed
> to nested code).
>
> As to the actual data page in this scenario, there are a couple of
> aspects to take into consideration:
> - We must be talking about an insn accessing two locations (two memory
>   ones, one of which is MMIO, or a memory and an I/O one).

Really?

We're talking about all instructions, even without any memory operands,
because at the very minimum we cache the instruction stream, and (with
the following patch), the pagewalk to it.

> - If the non I/O / MMIO side is being read, the re-read (if it occurs at
>   all) is having its result discarded, by taking the shortcut through
>   the first switch()'s STATE_IORESP_READY case in hvmemul_do_io(). Note
>   how, among all the re-issue sanity checks there, we avoid comparing
>   the actual data.
> - If the non I/O / MMIO side is being written, it is the OS's
>   responsibility to avoid actually moving page contents to disk while
>   there might still be a write access in flight - this is no different
>   in behavior from bare hardware.
> - Read-modify-write accesses are, as always, complicated, and while we
>   deal with them better nowadays than we did in the past, we're still
>   not quite there to guarantee hardware-like behavior in all cases
>   anyway. Nothing is getting worse by the changes made here, afaict.
>
> In __hvm_copy() also reduce p's scope and change its type to void *.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TBD: In principle the caching here yields unnecessary the one used for
>      insn bytes (vio->mmio_insn{,_bytes}. However, to seed the cache
>      with the data SVM may have made available, we'd have to also know
>      the corresponding GPA. It's not safe, however, to re-walk the page
>      tables to find out, as the page tables may have changed in the
>      meantime. Therefore I guess we need to keep the duplicate
>      functionality for now. A possible solution to this could be to use
>      a physical-address-based cache for page table accesses (and looking
>      forward also e.g. SVM/VMX insn emulation), and a linear-address-
>      based one for all other reads.

Splitting caching like that will re-introduce the same bugs I pointed
out in earlier revisions of this series.  It is not correct to have
multiple (and therefore, non-coherent) caches of memory.

The AMD instruction stream bytes support is by no means perfect either. 
It doesn't function properly when SMAP is active (Erratum #1096), which
is actually caused by the instruction stream being re-fetched at vmexit
time, with a data side access (hence the interaction with SMAP).

Just as with the emulation for non-Nrips hardware, the contents of the
instruction buffer (if valid) need cross referencing with the vmexit
reason to spot races, and the same checking would make it safe to
deduplicate the caching.

> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -28,6 +28,17 @@
>  #include <asm/iocap.h>
>  #include <asm/vm_event.h>
>  
> +struct hvmemul_cache
> +{

This needs a comment (see below.)

> +    unsigned int num_ents;
> +    unsigned int max_ents;
> +    struct {
> +        paddr_t gpa:PADDR_BITS;
> +        unsigned int size:BITS_PER_LONG - PADDR_BITS;

This would probably result in rather better code if size was 8 bits
rather than 12.

Longer term, the size field should disappear and functionality adjusted
to always read aligned 8 (or larger) byte values.  That way, the cache
behaves much more like caches in real processors.

> @@ -2838,6 +2868,123 @@ void hvm_dump_emulation_state(const char
>             hvmemul_ctxt->insn_buf);
>  }
>  
> +int hvmemul_cache_init(struct vcpu *v)
> +{
> +    /*
> +     * No insn can access more than 16 independent linear addresses (AVX512F
> +     * scatters/gathers being the worst).

It is at least 5 more than the worst case number of regular operands, to
cover exceptions, and therefore accesses into the IDT/GDT/LDT and TR and
stack.

You can manage that with a 32bit guest with an exception, %cs and %ss in
different tables, and having to cross a page boundary when pushing the
esp0 exception frame.

It is one fewer generally in 64bit mode, because there are no %ss
references to deal with, but both cases get even more complicated for a
contributory exception, as the second %cs can be different to %cs for
the first exception.

>  Each such linear range can span a
> +     * page boundary, i.e. may require two page walks. Account for each insn
> +     * byte individually.

Why are we counting 5 entries for every instruction byte?  Given an
8-byte maximum cached size, we need at most 2 entries to cover a single
instruction.

> +     */
> +    const unsigned int nents = (CONFIG_PAGING_LEVELS + 1) *
> +                               (MAX_INST_LEN + 16 * 2);
> +    struct hvmemul_cache *cache = xmalloc_flex_struct(struct hvmemul_cache,
> +                                                      ents, nents);
> +
> +    if ( !cache )
> +        return -ENOMEM;
> +
> +    /* Cache is disabled initially. */
> +    cache->num_ents = nents + 1;
> +    cache->max_ents = nents;

(Below:) You need to document the internal semantics (probably best in
the struct definition.)

I thought I'd reverse engineered how disabling works (starting from the
completely obscure hvmemul_cache_disabled()), but given that the cache
is apparently disabled by default, I now can't figure out how it ever
gets enabled.

I can't see any code path where num_ents ends up lower than max_ents,
and the read/write helpers below don't take their early exit path.

> +bool hvmemul_read_cache(const struct vcpu *v, paddr_t gpa,
> +                        void *buffer, unsigned int size)
> +{
> +    const struct hvmemul_cache *cache = v->arch.hvm.hvm_io.cache;
> +    unsigned int i;
> +
> +    /* Cache unavailable? */
> +    if ( cache->num_ents > cache->max_ents )
> +        return false;
> +
> +    while ( size > sizeof(cache->ents->data) )
> +    {
> +        i = gpa & (sizeof(cache->ents->data) - 1)
> +            ? -gpa & (sizeof(cache->ents->data) - 1)
> +            : sizeof(cache->ents->data);
> +        if ( !hvmemul_read_cache(v, gpa, buffer, i) )
> +            return false;

What is this call trying to achieve?

> +        gpa += i;
> +        buffer += i;
> +        size -= i;
> +    }
> +
> +    for ( i = 0; i < cache->num_ents; ++i )
> +        if ( cache->ents[i].gpa == gpa && cache->ents[i].size == size )

With num_ents currently topping out at 235 (and needing to increase
anyway), this can end up being a very long loop.

Given that it will be populated with higher level PTEs to begin with,
the common case of accessing real data will be at the end of the O(n)
search rather than the start.

The cache is a single block of memory, so it is almost certainly better
to keep it sorted.  That said, it is problematic due to the overlap of
sizes, which I'm still concerned about generally.

> --- a/xen/include/asm-x86/hvm/emulate.h
> +++ b/xen/include/asm-x86/hvm/emulate.h
> @@ -13,6 +13,7 @@
>  #define __ASM_X86_HVM_EMULATE_H__
>  
>  #include <xen/err.h>
> +#include <xen/sched.h>
>  #include <asm/hvm/hvm.h>
>  #include <asm/x86_emulate.h>
>  
> @@ -96,6 +97,31 @@ int hvmemul_do_pio_buffer(uint16_t port,
>                            uint8_t dir,
>                            void *buffer);
>  
> +#ifdef CONFIG_HVM

This needs a comment block stating, at a minimum, the semantics and
expected use for the cache.

Otherwise, this is ~150 lines of totally undocumented and practically
uncommented, critical and subtle infrastructure.

~Andrew

> +int __must_check hvmemul_cache_init(struct vcpu *v);
> +static inline void hvmemul_cache_destroy(struct vcpu *v)
> +{
> +    XFREE(v->arch.hvm.hvm_io.cache);
> +}
> +bool hvmemul_read_cache(const struct vcpu *, paddr_t gpa,
> +                        void *buffer, unsigned int size);
> +void hvmemul_write_cache(const struct vcpu *, paddr_t gpa,
> +                         const void *buffer, unsigned int size);
> +unsigned int hvmemul_cache_disable(struct vcpu *);
> +void hvmemul_cache_restore(struct vcpu *, unsigned int token);
> +/* For use in ASSERT()s only: */
> +static inline bool hvmemul_cache_disabled(struct vcpu *v)
> +{
> +    return hvmemul_cache_disable(v) == hvmemul_cache_disable(v);
> +}
> +#else
> +static inline bool hvmemul_read_cache(const struct vcpu *v, paddr_t gpa,
> +                                      void *buf,
> +                                      unsigned int size) { return false; }
> +static inline void hvmemul_write_cache(const struct vcpu *v, paddr_t gpa,
> +                                       const void *buf, unsigned int size) {}
> +#endif
> +
>  void hvm_dump_emulation_state(const char *loglvl, const char *prefix,
>                                struct hvm_emulate_ctxt *hvmemul_ctxt, int rc);
>  

Re: [Xen-devel] [PATCH v4 4/7] x86/HVM: implement memory read caching for insn emulation
Posted by Jan Beulich 4 years, 2 months ago
On 03.02.2020 20:48, Andrew Cooper wrote:
> On 31/01/2020 16:44, Jan Beulich wrote:
>> TBD: In principle the caching here yields unnecessary the one used for
>>      insn bytes (vio->mmio_insn{,_bytes}. However, to seed the cache
>>      with the data SVM may have made available, we'd have to also know
>>      the corresponding GPA. It's not safe, however, to re-walk the page
>>      tables to find out, as the page tables may have changed in the
>>      meantime. Therefore I guess we need to keep the duplicate
>>      functionality for now. A possible solution to this could be to use
>>      a physical-address-based cache for page table accesses (and looking
>>      forward also e.g. SVM/VMX insn emulation), and a linear-address-
>>      based one for all other reads.
> 
> Splitting caching like that will re-introduce the same bugs I pointed
> out in earlier revisions of this series.  It is not correct to have
> multiple (and therefore, non-coherent) caches of memory.
> 
> The AMD instruction stream bytes support is by no means perfect either. 
> It doesn't function properly when SMAP is active (Erratum #1096), which
> is actually caused by the instruction stream being re-fetched at vmexit
> time, with a data side access (hence the interaction with SMAP).

I've looked into this some more, and I'm afraid the text is too
ambiguous to draw conclusions as to possible actions on our
part. It mentions a "GuestInstrBytes field of the VMCB", but the
PM doesn't use such naming afaics. Hence it's not clear whether
the entire 16-byte block is meant, or just the high 120 bits of
it. In the former case we're fine, but in the latter case we'd
mistakenly use the zeros and interpret them as "add %al, (%rax)"
(in 64-bit guest mode, different memory address in others), and
hence we'd need to suppress setting or use of
v->arch.hvm.svm.cached_insn_len. (The used wording comes closer
to the latter case, with the VMCB field at 0xD0 described as
"Number of bytes fetched" and then "Guest instruction bytes".)

Jan

Re: [Xen-devel] [PATCH v4 4/7] x86/HVM: implement memory read caching for insn emulation
Posted by Jan Beulich 4 years, 2 months ago
On 03.02.2020 20:48, Andrew Cooper wrote:
> On 31/01/2020 16:44, Jan Beulich wrote:
>> Emulation requiring device model assistance uses a form of instruction
>> re-execution, assuming that the second (and any further) pass takes
>> exactly the same path. This is a valid assumption as far as use of CPU
>> registers goes (as those can't change without any other instruction
>> executing in between), but is wrong for memory accesses.
> 
> This statement isn't quite accurate.  Various hypercalls and IPIs can
> play with GPR state in the middle of emulation.
> 
> What matters is that the GPR values aren't expected to change, and a
> guest gets to keep all the pieces if they try to play games in this area.

Not just GPR values. I've added a footnote:

[1] Other than on actual hardware, actions like
    XEN_DOMCTL_sethvmcontext, XEN_DOMCTL_setvcpucontext,
    XEN_DOMCTL_set_ext_vcpucontext, VCPUOP_initialise, INIT, or SIPI
    issued against the vCPU can occur while the vCPU is blocked waiting
    for a device model to return data. This won't affect stability of
    Xen, though, it may just lead to guest misbehavior.

I'll see whether I can come up with a reasonable way to deal with the
situation. If so, I may insert another patch ahead of this one in v5.
But I don't think this is an urgent issue to address.

>> As to the actual data page in this scenario, there are a couple of
>> aspects to take into consideration:
>> - We must be talking about an insn accessing two locations (two memory
>>   ones, one of which is MMIO, or a memory and an I/O one).
> 
> Really?
> 
> We're talking about all instructions, even without any memory operands,
> because at the very minimum we cache the instruction stream, and (with
> the following patch), the pagewalk to it.

I'm describing the problem case here that wants fixing, not the general
one. I.e. the case where without this change re-execution might go wrong
after the device model has returned data.

>> TBD: In principle the caching here yields unnecessary the one used for
>>      insn bytes (vio->mmio_insn{,_bytes}. However, to seed the cache
>>      with the data SVM may have made available, we'd have to also know
>>      the corresponding GPA. It's not safe, however, to re-walk the page
>>      tables to find out, as the page tables may have changed in the
>>      meantime. Therefore I guess we need to keep the duplicate
>>      functionality for now. A possible solution to this could be to use
>>      a physical-address-based cache for page table accesses (and looking
>>      forward also e.g. SVM/VMX insn emulation), and a linear-address-
>>      based one for all other reads.
> 
> Splitting caching like that will re-introduce the same bugs I pointed
> out in earlier revisions of this series.  It is not correct to have
> multiple (and therefore, non-coherent) caches of memory.

Well, I continue to disagree, and I can only point you back at the
example of insns with multiple explicit memory references actually
picking up changes done by a remote agent. E.g. REP CMPSB with
rSI == rDI can be observed to terminate with ZF clear and rCX non-
zero. I can only re-iterate that what I introduce here is not a
cache in the sense the CPU caches work (it is now coming closer
than in prior versions, but there are differences which I think
will need to remain - see further down). Instead it is one in the
sense of "secret store for intermediate data". There is no
question of coherency whatsoever for the purposes here: A datum
once read can't become stale, no matter what writes (and by whom)
to the same memory location might occur.

> The AMD instruction stream bytes support is by no means perfect either. 
> It doesn't function properly when SMAP is active (Erratum #1096), which
> is actually caused by the instruction stream being re-fetched at vmexit
> time, with a data side access (hence the interaction with SMAP).
> 
> Just as with the emulation for non-Nrips hardware, the contents of the
> instruction buffer (if valid) need cross referencing with the vmexit
> reason to spot races, and the same checking would make it safe to
> deduplicate the caching.

Right, and we've put in place some basic sanity checks for this
purpose, but no, I'm not convinced this is good enough to allow
safely dropping the duplicate caching. But anyway, this isn't
anything to be done in this patch, but only - if at all - in a
follow-on one.

>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -28,6 +28,17 @@
>>  #include <asm/iocap.h>
>>  #include <asm/vm_event.h>
>>  
>> +struct hvmemul_cache
>> +{
> 
> This needs a comment (see below.)

I don't think I've spotted which of your further remarks this refers
to, and hence what it is that a comment here would need to say.

>> +    unsigned int num_ents;
>> +    unsigned int max_ents;
>> +    struct {
>> +        paddr_t gpa:PADDR_BITS;
>> +        unsigned int size:BITS_PER_LONG - PADDR_BITS;
> 
> This would probably result in rather better code if size was 8 bits
> rather than 12.

I can make it so, by inserting a BITS_PER_LONG - PADDR_BITS - 8
unnamed field. I agree that this way no shift is going to be
needed.
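
I.e. something along these lines (exact layout to be confirmed in v5):

    struct {
        paddr_t gpa:PADDR_BITS;
        unsigned int :BITS_PER_LONG - PADDR_BITS - 8;
        unsigned int size:8;
        unsigned long data;
    } ents[];

which makes "size" a byte-aligned 8-bit field, accessible without any
shifting.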

> Longer term, the size field should disappear and functionality adjusted
> to always read aligned 8 (or larger) byte values.  That way, the cache
> behaves much more like caches in real processors.

As per above, I don't think this can or should be the goal. Not
least because this cache (as said, as in "secret store") is
also used to "cache" uncacheable data. In such cases we may not
read more than what was asked.

>> @@ -2838,6 +2868,123 @@ void hvm_dump_emulation_state(const char
>>             hvmemul_ctxt->insn_buf);
>>  }
>>  
>> +int hvmemul_cache_init(struct vcpu *v)
>> +{
>> +    /*
>> +     * No insn can access more than 16 independent linear addresses (AVX512F
>> +     * scatters/gathers being the worst).
> 
>> It is at least 5 more than the worst case number of regular operands, to
> cover exceptions, and therefore accesses into the IDT/GDT/LDT and TR and
> stack.

The emulator doesn't actually emulate exception delivery (yet?),
so no such memory accesses would result. I'd also not call these
accesses ones the insn itself causes (other than e.g. a descriptor
table access by a segment register load).

> You can manage that with a 32bit guest with an exception, %cs and %ss in
> different tables, and having to cross a page boundary when pushing the
> esp0 exception frame.
> 
> It is one fewer generally in 64bit mode, because there are no %ss
> references to deal with, but both cases get even more complicated for a
> contributory exception, as the second %cs can be different to %cs for
> the first exception.
> 
>>  Each such linear range can span a
>> +     * page boundary, i.e. may require two page walks. Account for each insn
>> +     * byte individually.
> 
> Why are we counting 5 entries for every instruction byte?  Given an
> 8-byte maximum cached size, we need at most 2 entries to cover a single
> instruction.

3 entries if the insn spans a page boundary, plus the two
corresponding page walks. But: While the page walk data would
be shared for the individual, perhaps as small as a byte, reads,
the fetched data may consume as many slots as there are
separate fetch requests. Recall that reads of adjacent
addresses don't get folded, even if the overall range would
fit. I can of course make the expression here more complicated,
to express MAX_INST_LEN actual data slots plus two sets of
page walk ones. Alternatively I can append "for simplicity" to
the comment, if that helps. Just let me know.
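
(For reference, with CONFIG_PAGING_LEVELS == 4 and MAX_INST_LEN == 15
the expression as it stands works out to (4 + 1) * (15 + 16 * 2) = 235
entries, i.e. the number you quote further down.)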

>> +     */
>> +    const unsigned int nents = (CONFIG_PAGING_LEVELS + 1) *
>> +                               (MAX_INST_LEN + 16 * 2);
>> +    struct hvmemul_cache *cache = xmalloc_flex_struct(struct hvmemul_cache,
>> +                                                      ents, nents);
>> +
>> +    if ( !cache )
>> +        return -ENOMEM;
>> +
>> +    /* Cache is disabled initially. */
>> +    cache->num_ents = nents + 1;
>> +    cache->max_ents = nents;
> 
> (Below:) You need to document the internal semantics (probably best in
> the struct definition.)

I've added "The cache is disabled as long as num_ents > max_ents." Is
there anything else I should add?

> I thought I'd reverse engineered how disabling works (starting from the
> completely obscure hvmemul_cache_disabled()),

If it's "completely obscure" - any suggestions how to improve the
situation?
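
The best I can offer for now is a comment next to the helper, along
the lines of (wording tentative):

    /*
     * The first call returns the current num_ents and disables the
     * cache; the second returns the "disabled" marker the first one
     * installed. The two compare equal only if the cache was disabled
     * to begin with, which is all the ASSERT()s care about (they sit
     * on paths where the cache has to be disabled anyway, so leaving
     * it disabled is fine).
     */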

> but given that the cache
> is apparently disabled by default, I now can't figure out how it ever
> gets enabled.
> 
> I can't see any code path where num_ents ends up lower than max_ents,
> and the read/write helpers below don't take their early exit path.

Check the hunk changing the top of _hvm_emulate_one().
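
To spell out the lifecycle (my summary, not text from the patch):

    hvmemul_cache_init():    num_ents = max_ents + 1   /* disabled */
    _hvm_emulate_one():      num_ents = 0 if no I/O request is pending
                             (enabled); left alone on re-execution so
                             entries from the earlier pass get consumed
    emulation complete:      hvmemul_cache_disable()   /* disabled */
    hvm_copy users:          disable() / restore(token) pairs hide the
                             cache temporarily without flushing it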

>> +bool hvmemul_read_cache(const struct vcpu *v, paddr_t gpa,
>> +                        void *buffer, unsigned int size)
>> +{
>> +    const struct hvmemul_cache *cache = v->arch.hvm.hvm_io.cache;
>> +    unsigned int i;
>> +
>> +    /* Cache unavailable? */
>> +    if ( cache->num_ents > cache->max_ents )
>> +        return false;
>> +
>> +    while ( size > sizeof(cache->ents->data) )
>> +    {
>> +        i = gpa & (sizeof(cache->ents->data) - 1)
>> +            ? -gpa & (sizeof(cache->ents->data) - 1)
>> +            : sizeof(cache->ents->data);
>> +        if ( !hvmemul_read_cache(v, gpa, buffer, i) )
>> +            return false;
> 
> What is this call trying to achieve?

When the full request won't fit in a single entry, this reads
the first so many bytes to make gpa 8-byte aligned if it
isn't already, or a single entry's worth of data otherwise.
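
E.g. (made-up numbers) a 16-byte read at gpa 0x1003 gets satisfied
from three entries: 5 bytes at 0x1003 (to reach the 8-byte boundary),
then 8 bytes at 0x1008, and finally the remaining 3 bytes at 0x1010
via the plain lookup below the loop.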

>> +        gpa += i;
>> +        buffer += i;
>> +        size -= i;
>> +    }
>> +
>> +    for ( i = 0; i < cache->num_ents; ++i )
>> +        if ( cache->ents[i].gpa == gpa && cache->ents[i].size == size )
> 
> With num_ents currently topping out at 235 (and needing to increase
> anyway), this can end up being a very long loop.
> 
> Given that it will be populated with higher level PTEs to begin with,
> the common case of accessing real data will be at the end of the O(n)
> search rather than the start.

Any "real data" access will also be accompanied by a page walk.
I therefore don't see how re-arrangement would help. Also
num_ents is hardly ever going to be more than a dozen or so,
unless we really need to emulate something exotic like an S/G
insn.

> The cache is a single block of memory, so it is almost certainly better
> to keep it sorted.  That said, it is problematic due to the overlap of
> sizes, which I'm still concerned about generally.

Well, I thought I'd start with something really simple. As per
above the "overlap" of sizes is not a problem, since each
individual read issued may be considered to observe memory
state at the time of its issuing (rather than that at the time
when a prior overlapping read was issued). Merging would _also_
be okay, but is not required for correctness.

The question is whether the higher cost of insertion (when
wanting to make the cache sorted) is outweighed by the cheaper
lookup. The common case is for there to not be re-execution:
Neither internally handled MMIO need it, nor writes sent to
the DM, nor almost anything that's introspection related.

One option I've been considering is to actually "sequence-
number" the accesses. Upon replay the same sequence of
operations is to occur anyway, which would make an actual
lookup (as in "search") of the correct entry unnecessary. Of
course this would move us further away again from how actual
CPU caches work. I could also imagine a hybrid model, where
the last read slot is recorded, and the next read would look
at the subsequent slot first.
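
(A sketch of that hybrid, assuming a hypothetical last_hit index were
added to the made-up struct above - once more not proposed code:

static bool cache_read_hybrid(struct hvmemul_cache *cache, paddr_t gpa,
                              void *buffer, unsigned int size)
{
    unsigned int i = cache->last_hit + 1;

    /* First guess: the slot right after the previous hit. */
    if ( i >= cache->num_ents ||
         cache->ents[i].gpa != gpa || cache->ents[i].size != size )
    {
        /* Otherwise fall back to the plain linear scan. */
        for ( i = 0; i < cache->num_ents; ++i )
            if ( cache->ents[i].gpa == gpa &&
                 cache->ents[i].size == size )
                break;
        if ( i == cache->num_ents )
            return false;
    }

    cache->last_hit = i;
    memcpy(buffer, &cache->ents[i].data, size);
    return true;
}

With strictly replayed access sequences the first guess would hit
almost every time, leaving the linear scan as a mere safety net.)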

>> --- a/xen/include/asm-x86/hvm/emulate.h
>> +++ b/xen/include/asm-x86/hvm/emulate.h
>> @@ -13,6 +13,7 @@
>>  #define __ASM_X86_HVM_EMULATE_H__
>>  
>>  #include <xen/err.h>
>> +#include <xen/sched.h>
>>  #include <asm/hvm/hvm.h>
>>  #include <asm/x86_emulate.h>
>>  
>> @@ -96,6 +97,31 @@ int hvmemul_do_pio_buffer(uint16_t port,
>>                            uint8_t dir,
>>                            void *buffer);
>>  
>> +#ifdef CONFIG_HVM
> 
> This needs a comment block stating, at a minimum, the semantics and
> expected use for the cache.
> 
> Otherwise, this is ~150 lines of totally undocumented and practically
> uncommented, critical and subtle infrastructure.

Will do; let's see if what I can come up with is good enough.

Jan

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v4 5/7] x86/mm: use cache in guest_walk_tables()
Posted by Jan Beulich 4 years, 2 months ago
Emulation requiring device model assistance uses a form of instruction
re-execution, assuming that the second (and any further) pass takes
exactly the same path. This is a valid assumption as far as use of CPU
registers goes (as those can't change without any other instruction
executing in between), but is wrong for memory accesses. In particular
it has been observed that Windows might page out buffers underneath an
instruction currently under emulation (hitting between two passes). If
the first pass translated a linear address successfully, any subsequent
pass needs to do so too, yielding the exact same translation. To
guarantee this, leverage the caching that now backs HVM insn emulation.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: Adjust for cache now (elsewhere) being transparent to callers.
    Provide inline stubs for the !HVM case.
v2: Don't wrongly use top_gfn for non-root gpa calculation. Re-write
    cache entries after setting A/D bits (an alternative would be to
    suppress their setting upon cache hits).

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -2918,7 +2918,7 @@ bool hvmemul_read_cache(const struct vcp
     unsigned int i;
 
     /* Cache unavailable? */
-    if ( cache->num_ents > cache->max_ents )
+    if ( !is_hvm_vcpu(v) || cache->num_ents > cache->max_ents )
         return false;
 
     while ( size > sizeof(cache->ents->data) )
@@ -2950,7 +2950,7 @@ void hvmemul_write_cache(const struct vc
     unsigned int i;
 
     /* Cache unavailable? */
-    if ( cache->num_ents > cache->max_ents )
+    if ( !is_hvm_vcpu(v) || cache->num_ents > cache->max_ents )
         return;
 
     while ( size > sizeof(cache->ents->data) )
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -31,6 +31,7 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #include <xen/sched.h>
 #include <asm/page.h>
 #include <asm/guest_pt.h>
+#include <asm/hvm/emulate.h>
 
 /*
  * Modify a guest pagetable entry to set the Accessed and Dirty bits.
@@ -80,9 +81,9 @@ static bool set_ad_bits(guest_intpte_t *
  * requested walk, to see whether the access is permitted.
  */
 bool
-guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
-                  unsigned long va, walk_t *gw,
-                  uint32_t walk, mfn_t top_mfn, void *top_map)
+guest_walk_tables(const struct vcpu *v, struct p2m_domain *p2m,
+                  unsigned long va, walk_t *gw, uint32_t walk,
+                  gfn_t top_gfn, mfn_t top_mfn, void *top_map)
 {
     struct domain *d = v->domain;
     p2m_type_t p2mt;
@@ -91,8 +92,13 @@ guest_walk_tables(struct vcpu *v, struct
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
     guest_l3e_t *l3p = NULL;
     guest_l4e_t *l4p;
+    paddr_t l4gpa;
+#endif
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+    paddr_t l3gpa;
 #endif
     uint32_t gflags, rc;
+    paddr_t l1gpa = 0, l2gpa = 0;
     unsigned int leaf_level;
     p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE;
 
@@ -133,7 +139,13 @@ guest_walk_tables(struct vcpu *v, struct
     /* Get the l4e from the top level table and check its flags*/
     gw->l4mfn = top_mfn;
     l4p = (guest_l4e_t *) top_map;
-    gw->l4e = l4p[guest_l4_table_offset(va)];
+    l4gpa = gfn_to_gaddr(top_gfn) +
+            guest_l4_table_offset(va) * sizeof(gw->l4e);
+    if ( !hvmemul_read_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e)) )
+    {
+        gw->l4e = l4p[guest_l4_table_offset(va)];
+        hvmemul_write_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e));
+    }
     gflags = guest_l4e_get_flags(gw->l4e);
     if ( !(gflags & _PAGE_PRESENT) )
         goto out;
@@ -163,7 +175,13 @@ guest_walk_tables(struct vcpu *v, struct
     }
 
     /* Get the l3e and check its flags*/
-    gw->l3e = l3p[guest_l3_table_offset(va)];
+    l3gpa = gfn_to_gaddr(guest_l4e_get_gfn(gw->l4e)) +
+            guest_l3_table_offset(va) * sizeof(gw->l3e);
+    if ( !hvmemul_read_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e)) )
+    {
+        gw->l3e = l3p[guest_l3_table_offset(va)];
+        hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
+    }
     gflags = guest_l3e_get_flags(gw->l3e);
     if ( !(gflags & _PAGE_PRESENT) )
         goto out;
@@ -215,7 +233,14 @@ guest_walk_tables(struct vcpu *v, struct
 #else /* PAE only... */
 
     /* Get the l3e and check its flag */
-    gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
+    l3gpa = gfn_to_gaddr(top_gfn) + ((unsigned long)top_map & ~PAGE_MASK) +
+            guest_l3_table_offset(va) * sizeof(gw->l3e);
+    if ( !hvmemul_read_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e)) )
+    {
+        gw->l3e = ((guest_l3e_t *)top_map)[guest_l3_table_offset(va)];
+        hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
+    }
+
     gflags = guest_l3e_get_flags(gw->l3e);
     if ( !(gflags & _PAGE_PRESENT) )
         goto out;
@@ -241,18 +266,24 @@ guest_walk_tables(struct vcpu *v, struct
         goto out;
     }
 
-    /* Get the l2e */
-    gw->l2e = l2p[guest_l2_table_offset(va)];
+    l2gpa = gfn_to_gaddr(guest_l3e_get_gfn(gw->l3e));
 
 #else /* 32-bit only... */
 
-    /* Get l2e from the top level table */
     gw->l2mfn = top_mfn;
     l2p = (guest_l2e_t *) top_map;
-    gw->l2e = l2p[guest_l2_table_offset(va)];
+    l2gpa = gfn_to_gaddr(top_gfn);
 
 #endif /* All levels... */
 
+    /* Get the l2e */
+    l2gpa += guest_l2_table_offset(va) * sizeof(gw->l2e);
+    if ( !hvmemul_read_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e)) )
+    {
+        gw->l2e = l2p[guest_l2_table_offset(va)];
+        hvmemul_write_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e));
+    }
+
     /* Check the l2e flags. */
     gflags = guest_l2e_get_flags(gw->l2e);
     if ( !(gflags & _PAGE_PRESENT) )
@@ -334,7 +365,15 @@ guest_walk_tables(struct vcpu *v, struct
         gw->pfec |= rc & PFEC_synth_mask;
         goto out;
     }
-    gw->l1e = l1p[guest_l1_table_offset(va)];
+
+    l1gpa = gfn_to_gaddr(guest_l2e_get_gfn(gw->l2e)) +
+            guest_l1_table_offset(va) * sizeof(gw->l1e);
+    if ( !hvmemul_read_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e)) )
+    {
+        gw->l1e = l1p[guest_l1_table_offset(va)];
+        hvmemul_write_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e));
+    }
+
     gflags = guest_l1e_get_flags(gw->l1e);
     if ( !(gflags & _PAGE_PRESENT) )
         goto out;
@@ -445,22 +484,34 @@ guest_walk_tables(struct vcpu *v, struct
     case 1:
         if ( set_ad_bits(&l1p[guest_l1_table_offset(va)].l1, &gw->l1e.l1,
                          (walk & PFEC_write_access)) )
+        {
             paging_mark_dirty(d, gw->l1mfn);
+            hvmemul_write_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e));
+        }
         /* Fallthrough */
     case 2:
         if ( set_ad_bits(&l2p[guest_l2_table_offset(va)].l2, &gw->l2e.l2,
                          (walk & PFEC_write_access) && leaf_level == 2) )
+        {
             paging_mark_dirty(d, gw->l2mfn);
+            hvmemul_write_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e));
+        }
         /* Fallthrough */
 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
     case 3:
         if ( set_ad_bits(&l3p[guest_l3_table_offset(va)].l3, &gw->l3e.l3,
                          (walk & PFEC_write_access) && leaf_level == 3) )
+        {
             paging_mark_dirty(d, gw->l3mfn);
+            hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
+        }
 
         if ( set_ad_bits(&l4p[guest_l4_table_offset(va)].l4, &gw->l4e.l4,
                          false) )
+        {
             paging_mark_dirty(d, gw->l4mfn);
+            hvmemul_write_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e));
+        }
 #endif
     }
 
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -91,7 +91,8 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PA
 #if GUEST_PAGING_LEVELS == 3
     top_map += (cr3 & ~(PAGE_MASK | 31));
 #endif
-    walk_ok = guest_walk_tables(v, p2m, ga, &gw, *pfec, top_mfn, top_map);
+    walk_ok = guest_walk_tables(v, p2m, ga, &gw, *pfec,
+                                top_gfn, top_mfn, top_map);
     unmap_domain_page(top_map);
     put_page(top_page);
 
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -175,9 +175,13 @@ static inline bool
 sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
                      uint32_t pfec)
 {
+    gfn_t root_gfn = _gfn(paging_mode_external(v->domain)
+                          ? cr3_pa(v->arch.hvm.guest_cr[3]) >> PAGE_SHIFT
+                          : pagetable_get_pfn(v->arch.guest_table));
+
 #if GUEST_PAGING_LEVELS == 3 /* PAE */
     return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec,
-                             INVALID_MFN, v->arch.paging.shadow.gl3e);
+                             root_gfn, INVALID_MFN, v->arch.paging.shadow.gl3e);
 #else /* 32 or 64 */
     const struct domain *d = v->domain;
     mfn_t root_mfn = (v->arch.flags & TF_kernel_mode
@@ -185,7 +189,7 @@ sh_walk_guest_tables(struct vcpu *v, uns
                       : pagetable_get_mfn(v->arch.guest_table_user));
     void *root_map = map_domain_page(root_mfn);
     bool ok = guest_walk_tables(v, p2m_get_hostp2m(d), va, gw, pfec,
-                                root_mfn, root_map);
+                                root_gfn, root_mfn, root_map);
 
     unmap_domain_page(root_map);
 
--- a/xen/include/asm-x86/guest_pt.h
+++ b/xen/include/asm-x86/guest_pt.h
@@ -428,8 +428,9 @@ static inline unsigned int guest_walk_to
 #define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)
 
 bool
-guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, unsigned long va,
-                  walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map);
+guest_walk_tables(const struct vcpu *v, struct p2m_domain *p2m,
+                  unsigned long va, walk_t *gw, uint32_t pfec,
+                  gfn_t top_gfn, mfn_t top_mfn, void *top_map);
 
 /* Pretty-print the contents of a guest-walk */
 static inline void print_gw(const walk_t *gw)

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v4 6/7] x86/mm: drop p2mt parameter from map_domain_gfn()
Posted by Jan Beulich 4 years, 2 months ago
No caller actually consumes it.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/arch/x86/hvm/dom0_build.c
+++ b/xen/arch/x86/hvm/dom0_build.c
@@ -278,7 +278,6 @@ static int __init pvh_add_mem_range(stru
 
 static int __init pvh_setup_vmx_realmode_helpers(struct domain *d)
 {
-    p2m_type_t p2mt;
     uint32_t rc, *ident_pt;
     mfn_t mfn;
     paddr_t gaddr;
@@ -317,7 +316,7 @@ static int __init pvh_setup_vmx_realmode
      * superpages.
      */
     ident_pt = map_domain_gfn(p2m_get_hostp2m(d), _gfn(PFN_DOWN(gaddr)),
-                              &mfn, &p2mt, 0, &rc);
+                              &mfn, 0, &rc);
     if ( ident_pt == NULL )
     {
         printk("Unable to map identity page tables\n");
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -86,7 +86,6 @@ guest_walk_tables(const struct vcpu *v,
                   gfn_t top_gfn, mfn_t top_mfn, void *top_map)
 {
     struct domain *d = v->domain;
-    p2m_type_t p2mt;
     guest_l1e_t *l1p = NULL;
     guest_l2e_t *l2p = NULL;
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
@@ -165,7 +164,6 @@ guest_walk_tables(const struct vcpu *v,
     l3p = map_domain_gfn(p2m,
                          guest_l4e_get_gfn(gw->l4e),
                          &gw->l3mfn,
-                         &p2mt,
                          qt,
                          &rc);
     if ( l3p == NULL )
@@ -257,7 +255,6 @@ guest_walk_tables(const struct vcpu *v,
     l2p = map_domain_gfn(p2m,
                          guest_l3e_get_gfn(gw->l3e),
                          &gw->l2mfn,
-                         &p2mt,
                          qt,
                          &rc);
     if ( l2p == NULL )
@@ -357,7 +354,6 @@ guest_walk_tables(const struct vcpu *v,
     l1p = map_domain_gfn(p2m,
                          guest_l2e_get_gfn(gw->l2e),
                          &gw->l1mfn,
-                         &p2mt,
                          qt,
                          &rc);
     if ( l1p == NULL )
--- a/xen/arch/x86/mm/hap/nested_ept.c
+++ b/xen/arch/x86/mm/hap/nested_ept.c
@@ -151,7 +151,6 @@ static uint32_t
 nept_walk_tables(struct vcpu *v, unsigned long l2ga, ept_walk_t *gw)
 {
     int lvl;
-    p2m_type_t p2mt;
     uint32_t rc = 0, ret = 0, gflags;
     struct domain *d = v->domain;
     struct p2m_domain *p2m = d->arch.p2m;
@@ -163,7 +162,7 @@ nept_walk_tables(struct vcpu *v, unsigne
 
     for (lvl = 4; lvl > 0; lvl--)
     {
-        lxp = map_domain_gfn(p2m, base_gfn, &lxmfn, &p2mt, P2M_ALLOC, &rc);
+        lxp = map_domain_gfn(p2m, base_gfn, &lxmfn, P2M_ALLOC, &rc);
         if ( !lxp )
             goto map_err;
         gw->lxe[lvl] = lxp[ept_lvl_table_offset(l2ga, lvl)];
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -2214,8 +2214,9 @@ unsigned long paging_gva_to_gfn(struct v
  * synthetic/structure PFEC_* bits.
  */
 void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
-                     p2m_type_t *p2mt, p2m_query_t q, uint32_t *pfec)
+                     p2m_query_t q, uint32_t *pfec)
 {
+    p2m_type_t p2mt;
     struct page_info *page;
 
     if ( !gfn_valid(p2m->domain, gfn) )
@@ -2225,8 +2226,8 @@ void *map_domain_gfn(struct p2m_domain *
     }
 
     /* Translate the gfn, unsharing if shared. */
-    page = p2m_get_page_from_gfn(p2m, gfn, p2mt, NULL, q);
-    if ( p2m_is_paging(*p2mt) )
+    page = p2m_get_page_from_gfn(p2m, gfn, &p2mt, NULL, q);
+    if ( p2m_is_paging(p2mt) )
     {
         ASSERT(p2m_is_hostp2m(p2m));
         if ( page )
@@ -2235,7 +2236,7 @@ void *map_domain_gfn(struct p2m_domain *
         *pfec = PFEC_page_paged;
         return NULL;
     }
-    if ( p2m_is_shared(*p2mt) )
+    if ( p2m_is_shared(p2mt) )
     {
         if ( page )
             put_page(page);
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -762,7 +762,7 @@ int __must_check p2m_set_entry(struct p2
 extern void p2m_pt_init(struct p2m_domain *p2m);
 
 void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
-                     p2m_type_t *p2mt, p2m_query_t q, uint32_t *pfec);
+                     p2m_query_t q, uint32_t *pfec);
 
 /* Debugging and auditing of the P2M code? */
 #ifndef NDEBUG


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v4 6/7] x86/mm: drop p2mt parameter from map_domain_gfn()
Posted by Andrew Cooper 4 years, 2 months ago
On 31/01/2020 16:45, Jan Beulich wrote:
> No caller actually consumes it.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v4 7/7] x86/HVM: reduce scope of pfec in hvm_emulate_init_per_insn()
Posted by Jan Beulich 4 years, 2 months ago
It needs calculating only in one out of three cases. Re-structure the
code a little such that the variable truly gets calculated only when we
don't get any insn bytes from elsewhere, and hence need to (try to)
fetch them. Also OR in PFEC_insn_fetch right in the initializer.

While in this mood, restrict addr's scope as well.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v4: New.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -2762,8 +2762,6 @@ void hvm_emulate_init_per_insn(
     unsigned int insn_bytes)
 {
     struct vcpu *curr = current;
-    unsigned int pfec = PFEC_page_present;
-    unsigned long addr;
 
     hvmemul_ctxt->ctxt.lma = hvm_long_mode_active(curr);
 
@@ -2778,14 +2776,23 @@ void hvm_emulate_init_per_insn(
             hvmemul_ctxt->seg_reg[x86_seg_ss].db ? 32 : 16;
     }
 
-    if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
-        pfec |= PFEC_user_mode;
-
     hvmemul_ctxt->insn_buf_eip = hvmemul_ctxt->ctxt.regs->rip;
-    if ( !insn_bytes )
+
+    if ( insn_bytes )
     {
+        hvmemul_ctxt->insn_buf_bytes = insn_bytes;
+        memcpy(hvmemul_ctxt->insn_buf, insn_buf, insn_bytes);
+    }
+    else if ( !(hvmemul_ctxt->insn_buf_bytes =
+                hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf)) )
+    {
+        unsigned int pfec = PFEC_page_present | PFEC_insn_fetch;
+        unsigned long addr;
+
+        if ( hvmemul_ctxt->seg_reg[x86_seg_ss].dpl == 3 )
+            pfec |= PFEC_user_mode;
+
         hvmemul_ctxt->insn_buf_bytes =
-            hvm_get_insn_bytes(curr, hvmemul_ctxt->insn_buf) ?:
             (hvm_virtual_to_linear_addr(x86_seg_cs,
                                         &hvmemul_ctxt->seg_reg[x86_seg_cs],
                                         hvmemul_ctxt->insn_buf_eip,
@@ -2795,15 +2802,9 @@ void hvm_emulate_init_per_insn(
                                         &addr) &&
              hvm_copy_from_guest_linear(hvmemul_ctxt->insn_buf, addr,
                                         sizeof(hvmemul_ctxt->insn_buf),
-                                        pfec | PFEC_insn_fetch,
-                                        NULL) == HVMTRANS_okay) ?
+                                        pfec, NULL) == HVMTRANS_okay) ?
             sizeof(hvmemul_ctxt->insn_buf) : 0;
     }
-    else
-    {
-        hvmemul_ctxt->insn_buf_bytes = insn_bytes;
-        memcpy(hvmemul_ctxt->insn_buf, insn_buf, insn_bytes);
-    }
 }
 
 void hvm_emulate_writeback(


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
Re: [Xen-devel] [PATCH v4 7/7] x86/HVM: reduce scope of pfec in hvm_emulate_init_per_insn()
Posted by Andrew Cooper 4 years, 2 months ago
On 31/01/2020 16:46, Jan Beulich wrote:
> It needs calculating only in one out of three cases. Re-structure the
> code a little such that the variable truly gets calculated only when we
> don't get any insn bytes from elsewhere, and hence need to (try to)
> fetch them. Also OR in PFEC_insn_fetch right in the initializer.
>
> While in this mood, restrict addr's scope as well.
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>

Acked-by: Andrew Cooper <andrew.cooper3@citrix.com>

_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel