[Xen-devel] [PATCH v5 0/4] x86/HVM: implement memory read caching

Posted by Jan Beulich 4 years, 1 month ago
Emulation requiring device model assistance uses a form of instruction
re-execution, assuming that the second (and any further) pass takes
exactly the same path. This is a valid assumption as far as use of CPU
registers goes (as those can't change without any other instruction
executing in between), but is wrong for memory accesses. In particular
it has been observed that Windows might page out buffers underneath
an instruction currently under emulation (hitting between two passes).
If the first pass translated a linear address successfully, any subsequent
pass needs to do so too, yielding the exact same translation.

Introduce a cache to make sure the above-described assumption holds. This
is a very simplistic implementation for now: Only exact matches are
satisfied (no overlaps or partial reads or anything).
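
To illustrate the intended semantics (purely a sketch - struct replay_cache
and its helpers below are made-up names, not the hvmemul_cache interface the
second patch actually adds): the first pass records the value it read at a
given guest-physical address, and any later pass replays exactly that value;
anything that isn't an identical (gpa, size) query is a miss.

    #include <stdbool.h>
    #include <stdint.h>
    #include <string.h>

    #define REPLAY_ENTS 32 /* illustrative capacity only */

    struct replay_cache {
        unsigned int num_ents;
        struct {
            uint64_t gpa;      /* guest-physical address of the read */
            unsigned int size; /* size of the original access in bytes */
            uint64_t data;     /* value observed on the first pass */
        } ents[REPLAY_ENTS];
    };

    /* First pass: record the value actually read from guest memory. */
    static void replay_cache_record(struct replay_cache *c, uint64_t gpa,
                                    const void *buf, unsigned int size)
    {
        if ( c->num_ents < REPLAY_ENTS && size <= sizeof(c->ents->data) )
        {
            c->ents[c->num_ents].gpa = gpa;
            c->ents[c->num_ents].size = size;
            memcpy(&c->ents[c->num_ents].data, buf, size);
            ++c->num_ents;
        }
    }

    /* Re-execution: only an exact (gpa, size) match is replayed. */
    static bool replay_cache_lookup(const struct replay_cache *c, uint64_t gpa,
                                    void *buf, unsigned int size)
    {
        unsigned int i;

        for ( i = 0; i < c->num_ents; ++i )
            if ( c->ents[i].gpa == gpa && c->ents[i].size == size )
            {
                memcpy(buf, &c->ents[i].data, size);
                return true;
            }

        return false; /* miss - read guest memory and record the result */
    }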

There's also some perhaps seemingly unrelated cleanup here which was
found desirable along the way - the initial patch is truly a prereq
(at least in a contextual way), while the last one is just for things
noticed along the way.

1: x86/HVM: cancel emulation when register state got altered
2: x86/HVM: implement memory read caching for insn emulation
3: x86/mm: use cache in guest_walk_tables()
4: x86/HVM: __hvm_copy()'s size parameter is an unsigned quantity

The main differences from v4 are the new first and last patches (with
the latter being largely unrelated cleanup). For other changes see
the individual patches.

Jan

[Xen-devel] [PATCH v5 1/4] x86/HVM: cancel emulation when register state got altered
Posted by Jan Beulich 4 years, 1 month ago
Re-execution (after having received data from a device model) relies on
the same register state still being in place as it was when the request
was first sent to the device model. Therefore vCPU state changes
effected by remote sources must not lead to any attempt at re-execution;
the returned data is to simply be ignored instead.

Note that any such asynchronous state changes happen with the vCPU at
least paused (potentially down and/or not marked ->is_initialised), so
register state can't get fiddled with behind an actively running
emulator's back. Hence the new function doesn't need to synchronize with
the core emulation logic.

Suggested-by: Andrew Cooper <andrew.cooper3@citrix.com>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/arch/x86/domctl.c
+++ b/xen/arch/x86/domctl.c
@@ -21,6 +21,7 @@
 #include <xen/iocap.h>
 #include <xen/paging.h>
 #include <asm/irq.h>
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/processor.h>
@@ -1147,11 +1148,16 @@ long arch_do_domctl(
             else
             {
                 vcpu_pause(v);
+
                 v->arch.xcr0 = _xcr0;
                 v->arch.xcr0_accum = _xcr0_accum;
                 v->arch.nonlazy_xstate_used = _xcr0_accum & XSTATE_NONLAZY;
                 compress_xsave_states(v, _xsave_area,
                                       evc->size - PV_XSAVE_HDR_SIZE);
+
+                if ( is_hvm_domain(d) )
+                    hvmemul_cancel(v);
+
                 vcpu_unpause(v);
             }
 
--- a/xen/arch/x86/hvm/domain.c
+++ b/xen/arch/x86/hvm/domain.c
@@ -22,6 +22,8 @@
 #include <xen/paging.h>
 #include <xen/sched.h>
 
+#include <asm/hvm/emulate.h>
+
 #include <public/hvm/hvm_vcpu.h>
 
 static int check_segment(struct segment_register *reg, enum x86_segment seg)
@@ -323,6 +325,8 @@ int arch_set_info_hvm_guest(struct vcpu
 
     paging_update_paging_modes(v);
 
+    hvmemul_cancel(v);
+
     v->is_initialised = 1;
     set_bit(_VPF_down, &v->pause_flags);
 
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -121,6 +121,23 @@ static const struct hvm_io_handler ioreq
     .ops = &ioreq_server_ops
 };
 
+/*
+ * Drop all records of in-flight emulation. This is needed whenever a vCPU's
+ * register state may have changed behind the emulator's back.
+ */
+void hvmemul_cancel(struct vcpu *v)
+{
+    struct hvm_vcpu_io *vio = &v->arch.hvm.hvm_io;
+
+    vio->io_req.state = STATE_IOREQ_NONE;
+    vio->io_completion = HVMIO_no_completion;
+    vio->mmio_cache_count = 0;
+    vio->mmio_insn_bytes = 0;
+    vio->mmio_access = (struct npfec){};
+    vio->mmio_retry = false;
+    vio->g2m_ioport = NULL;
+}
+
 static int hvmemul_do_io(
     bool_t is_mmio, paddr_t addr, unsigned long *reps, unsigned int size,
     uint8_t dir, bool_t df, bool_t data_is_addr, uintptr_t data)
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -477,6 +477,14 @@ u64 hvm_get_guest_tsc_fixed(struct vcpu
     return tsc + v->arch.hvm.cache_tsc_offset;
 }
 
+void hvm_set_info_guest(struct vcpu *v)
+{
+    if ( hvm_funcs.set_info_guest )
+        alternative_vcall(hvm_funcs.set_info_guest, v);
+
+    hvmemul_cancel(v);
+}
+
 void hvm_migrate_timers(struct vcpu *v)
 {
     rtc_migrate_timers(v);
@@ -1162,6 +1170,8 @@ static int hvm_load_cpu_ctxt(struct doma
     v->arch.dr6   = ctxt.dr6;
     v->arch.dr7   = ctxt.dr7;
 
+    hvmemul_cancel(v);
+
     /* Auxiliary processors should be woken immediately. */
     v->is_initialised = 1;
     clear_bit(_VPF_down, &v->pause_flags);
--- a/xen/arch/x86/hvm/vlapic.c
+++ b/xen/arch/x86/hvm/vlapic.c
@@ -33,6 +33,7 @@
 #include <asm/apic.h>
 #include <asm/io_apic.h>
 #include <asm/vpmu.h>
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/io.h>
 #include <asm/hvm/support.h>
@@ -306,6 +307,8 @@ static void vlapic_init_sipi_one(struct
         BUG();
     }
 
+    hvmemul_cancel(target);
+
     vcpu_unpause(target);
 }
 
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -76,6 +76,7 @@ void hvm_emulate_init_per_insn(
     unsigned int insn_bytes);
 void hvm_emulate_writeback(
     struct hvm_emulate_ctxt *hvmemul_ctxt);
+void hvmemul_cancel(struct vcpu *v);
 struct segment_register *hvmemul_get_seg_reg(
     enum x86_segment seg,
     struct hvm_emulate_ctxt *hvmemul_ctxt);
--- a/xen/include/asm-x86/hvm/hvm.h
+++ b/xen/include/asm-x86/hvm/hvm.h
@@ -278,6 +278,8 @@ void hvm_get_segment_register(struct vcp
 void hvm_set_segment_register(struct vcpu *v, enum x86_segment seg,
                               struct segment_register *reg);
 
+void hvm_set_info_guest(struct vcpu *v);
+
 bool hvm_set_guest_bndcfgs(struct vcpu *v, u64 val);
 
 int hvm_vmexit_cpuid(struct cpu_user_regs *regs, unsigned int inst_len);
@@ -546,12 +548,6 @@ static inline unsigned int hvm_get_insn_
             ? alternative_call(hvm_funcs.get_insn_bytes, v, buf) : 0);
 }
 
-static inline void hvm_set_info_guest(struct vcpu *v)
-{
-    if ( hvm_funcs.set_info_guest )
-        alternative_vcall(hvm_funcs.set_info_guest, v);
-}
-
 static inline void hvm_invalidate_regs_fields(struct cpu_user_regs *regs)
 {
 #ifndef NDEBUG
@@ -682,7 +678,6 @@ static inline bool altp2m_vcpu_emulate_v
  */
 int hvm_guest_x86_mode(struct vcpu *v);
 unsigned long hvm_get_shadow_gs_base(struct vcpu *v);
-void hvm_set_info_guest(struct vcpu *v);
 void hvm_cpuid_policy_changed(struct vcpu *v);
 void hvm_set_tsc_offset(struct vcpu *v, uint64_t offset, uint64_t at_tsc);
 bool hvm_get_guest_bndcfgs(struct vcpu *v, uint64_t *val);


[Xen-devel] [PATCH v5 2/4] x86/HVM: implement memory read caching for insn emulation
Posted by Jan Beulich 4 years, 1 month ago
Emulation requiring device model assistance uses a form of instruction
re-execution, assuming that the second (and any further) pass takes
exactly the same path. This is a valid assumption as far as use of CPU
registers goes (as those can't change without any other instruction
executing in between [1]), but is wrong for memory accesses. In
particular it has been observed that Windows might page out buffers
underneath an instruction currently under emulation (hitting between two
passes). If the first pass read a memory operand successfully, any
subsequent pass needs to get to see the exact same value.

Introduce a cache to make sure the above-described assumption holds. This
is a very simplistic implementation for now: Only exact matches are
satisfied (no overlaps or partial reads or anything); this is sufficient
for the immediate purpose of making re-execution an exact replay. The
cache also won't be used just yet for guest page walks; that'll be the
subject of a subsequent change.

With the cache being generally transparent to upper layers, but with it
having limited capacity yet being required for correctness, certain
users of hvm_copy_from_guest_*() need to disable caching temporarily,
without invalidating the cache. Note that the adjustments here to
hvm_hypercall() and hvm_task_switch() are benign at this point; they'll
become relevant once we start to be able to emulate respective insns
through the main emulator (and more changes will then likely be needed
to nested code).
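
As a usage sketch, such bracketing looks like the hypothetical helper below
(copy_bypassing_cache() is not part of this series; it merely mirrors the
hvmemul_rep_movs() adjustment further down, using the interface this patch
introduces):

    /*
     * Hypothetical helper (illustration only): perform a guest-physical
     * copy without consuming or populating the replay cache, leaving any
     * previously recorded entries intact for the ongoing emulation.
     */
    static int copy_bypassing_cache(struct vcpu *curr, void *buf,
                                    paddr_t gpa, unsigned int bytes)
    {
        unsigned int token = hvmemul_cache_disable(curr);
        int rc = hvm_copy_from_guest_phys(buf, gpa, bytes);

        /* Re-enable the cache; recorded entries stay untouched. */
        hvmemul_cache_restore(curr, token);

        return rc;
    }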

As to the actual data page in a problematic scenario, there are a couple
of aspects to take into consideration:
- We must be talking about an insn accessing two locations (two memory
  ones, one of which is MMIO, or a memory and an I/O one).
- If the non I/O / MMIO side is being read, the re-read (if it occurs at
  all) is having its result discarded, by taking the shortcut through
  the first switch()'s STATE_IORESP_READY case in hvmemul_do_io(). Note
  how, among all the re-issue sanity checks there, we avoid comparing
  the actual data.
- If the non I/O / MMIO side is being written, it is the OS's
  responsibility to avoid actually moving page contents to disk while
  there might still be a write access in flight - this is no different
  in behavior from bare hardware.
- Read-modify-write accesses are, as always, complicated, and while we
  deal with them better nowadays than we did in the past, we're still
  not quite there to guarantee hardware-like behavior in all cases
  anyway. Nothing is getting worse by the changes made here, afaict.

In __hvm_copy() also reduce p's scope and change its type to void *.

[1] Other than on actual hardware, actions like
    XEN_DOMCTL_sethvmcontext, XEN_DOMCTL_setvcpucontext,
    VCPUOP_initialise, INIT, or SIPI issued against the vCPU can occur
    while the vCPU is blocked waiting for a device model to return data.
    In such cases emulation now gets canceled, though, and hence re-
    execution correctness is unaffected.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: In principle the caching here renders unnecessary the one used for
     insn bytes (vio->mmio_insn{,_bytes}). However, to seed the cache
     with the data SVM may have made available, we'd have to also know
     the corresponding GPA. It's not safe, however, to re-walk the page
     tables to find out, as the page tables may have changed in the
     meantime. Therefore I guess we need to keep the duplicate
     functionality for now. A possible solution to this could be to use
     a physical-address-based cache for page table accesses (and looking
     forward also e.g. SVM/VMX insn emulation), and a linear-address-
     based one for all other reads.
---
v5: Re-arrange bitfield. Use domain_crash() in hvmemul_write_cache().
    Move hvmemul_{read,write}_cache() stubs to later patch. Also adjust
    hvmemul_cancel(). Add / extend comments. Re-base.
v4: Re-write for cache to become transparent to callers.
v3: Add text about the actual data page to the description.
v2: Re-base.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -28,6 +28,19 @@
 #include <asm/iocap.h>
 #include <asm/vm_event.h>
 
+struct hvmemul_cache
+{
+    /* The cache is disabled as long as num_ents > max_ents. */
+    unsigned int num_ents;
+    unsigned int max_ents;
+    struct {
+        paddr_t gpa:PADDR_BITS;
+        unsigned int :BITS_PER_LONG - PADDR_BITS - 8;
+        unsigned int size:8;
+        unsigned long data;
+    } ents[];
+};
+
 static void hvmtrace_io_assist(const ioreq_t *p)
 {
     unsigned int size, event;
@@ -136,6 +149,8 @@ void hvmemul_cancel(struct vcpu *v)
     vio->mmio_access = (struct npfec){};
     vio->mmio_retry = false;
     vio->g2m_ioport = NULL;
+
+    hvmemul_cache_disable(v);
 }
 
 static int hvmemul_do_io(
@@ -1883,12 +1898,17 @@ static int hvmemul_rep_movs(
         rc = HVMTRANS_okay;
     }
     else
+    {
+        unsigned int token = hvmemul_cache_disable(curr);
+
         /*
          * We do a modicum of checking here, just for paranoia's sake and to
          * definitely avoid copying an unitialised buffer into guest address
          * space.
          */
         rc = hvm_copy_from_guest_phys(buf, sgpa, bytes);
+        hvmemul_cache_restore(curr, token);
+    }
 
     if ( rc == HVMTRANS_okay )
         rc = hvm_copy_to_guest_phys(dgpa, buf, bytes, curr);
@@ -2551,6 +2571,19 @@ static int _hvm_emulate_one(struct hvm_e
     struct hvm_vcpu_io *vio = &curr->arch.hvm.hvm_io;
     int rc;
 
+    /*
+     * Enable caching if it's currently disabled, but leave the cache
+     * untouched if it's already enabled, for re-execution to consume
+     * entries populated by an earlier pass.
+     */
+    if ( vio->cache->num_ents > vio->cache->max_ents )
+    {
+        ASSERT(vio->io_req.state == STATE_IOREQ_NONE);
+        vio->cache->num_ents = 0;
+    }
+    else
+        ASSERT(vio->io_req.state == STATE_IORESP_READY);
+
     hvm_emulate_init_per_insn(hvmemul_ctxt, vio->mmio_insn,
                               vio->mmio_insn_bytes);
 
@@ -2564,6 +2597,7 @@ static int _hvm_emulate_one(struct hvm_e
     {
         vio->mmio_cache_count = 0;
         vio->mmio_insn_bytes = 0;
+        hvmemul_cache_disable(curr);
     }
     else
     {
@@ -2856,6 +2890,123 @@ void hvm_dump_emulation_state(const char
            hvmemul_ctxt->insn_buf);
 }
 
+int hvmemul_cache_init(struct vcpu *v)
+{
+    /*
+     * No insn can access more than 16 independent linear addresses (AVX512F
+     * scatters/gathers being the worst). Each such linear range can span a
+     * page boundary, i.e. may require two page walks. Account for each insn
+     * byte individually, for simplicity.
+     */
+    const unsigned int nents = (CONFIG_PAGING_LEVELS + 1) *
+                               (MAX_INST_LEN + 16 * 2);
+    struct hvmemul_cache *cache = xmalloc_flex_struct(struct hvmemul_cache,
+                                                      ents, nents);
+
+    if ( !cache )
+        return -ENOMEM;
+
+    /* Cache is disabled initially. */
+    cache->num_ents = nents + 1;
+    cache->max_ents = nents;
+
+    v->arch.hvm.hvm_io.cache = cache;
+
+    return 0;
+}
+
+unsigned int hvmemul_cache_disable(struct vcpu *v)
+{
+    struct hvmemul_cache *cache = v->arch.hvm.hvm_io.cache;
+    unsigned int token = cache->num_ents;
+
+    cache->num_ents = cache->max_ents + 1;
+
+    return token;
+}
+
+void hvmemul_cache_restore(struct vcpu *v, unsigned int token)
+{
+    struct hvmemul_cache *cache = v->arch.hvm.hvm_io.cache;
+
+    ASSERT(cache->num_ents > cache->max_ents);
+    cache->num_ents = token;
+}
+
+bool hvmemul_read_cache(const struct vcpu *v, paddr_t gpa,
+                        void *buffer, unsigned int size)
+{
+    const struct hvmemul_cache *cache = v->arch.hvm.hvm_io.cache;
+    unsigned int i;
+
+    /* Cache unavailable? */
+    if ( cache->num_ents > cache->max_ents )
+        return false;
+
+    while ( size > sizeof(cache->ents->data) )
+    {
+        i = gpa & (sizeof(cache->ents->data) - 1)
+            ? -gpa & (sizeof(cache->ents->data) - 1)
+            : sizeof(cache->ents->data);
+        if ( !hvmemul_read_cache(v, gpa, buffer, i) )
+            return false;
+        gpa += i;
+        buffer += i;
+        size -= i;
+    }
+
+    for ( i = 0; i < cache->num_ents; ++i )
+        if ( cache->ents[i].gpa == gpa && cache->ents[i].size == size )
+        {
+            memcpy(buffer, &cache->ents[i].data, size);
+            return true;
+        }
+
+    return false;
+}
+
+void hvmemul_write_cache(const struct vcpu *v, paddr_t gpa,
+                         const void *buffer, unsigned int size)
+{
+    struct hvmemul_cache *cache = v->arch.hvm.hvm_io.cache;
+    unsigned int i;
+
+    /* Cache unavailable? */
+    if ( cache->num_ents > cache->max_ents )
+        return;
+
+    while ( size > sizeof(cache->ents->data) )
+    {
+        i = gpa & (sizeof(cache->ents->data) - 1)
+            ? -gpa & (sizeof(cache->ents->data) - 1)
+            : sizeof(cache->ents->data);
+        hvmemul_write_cache(v, gpa, buffer, i);
+        gpa += i;
+        buffer += i;
+        size -= i;
+    }
+
+    for ( i = 0; i < cache->num_ents; ++i )
+        if ( cache->ents[i].gpa == gpa && cache->ents[i].size == size )
+        {
+            memcpy(&cache->ents[i].data, buffer, size);
+            return;
+        }
+
+    if ( unlikely(i >= cache->max_ents) )
+    {
+        domain_crash(v->domain);
+        return;
+    }
+
+    cache->ents[i].gpa  = gpa;
+    cache->ents[i].size = size;
+
+    memcpy(&cache->ents[i].data, buffer, size);
+
+    cache->num_ents = i + 1;
+}
+
 /*
  * Local variables:
  * mode: C
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -726,6 +726,8 @@ int hvm_domain_initialise(struct domain
 /* This function and all its descendants need to be to be idempotent. */
 void hvm_domain_relinquish_resources(struct domain *d)
 {
+    struct vcpu *v;
+
     if ( hvm_funcs.domain_relinquish_resources )
         alternative_vcall(hvm_funcs.domain_relinquish_resources, d);
 
@@ -742,6 +744,9 @@ void hvm_domain_relinquish_resources(str
     rtc_deinit(d);
     pmtimer_deinit(d);
     hpet_deinit(d);
+
+    for_each_vcpu ( d, v )
+        hvmemul_cache_destroy(v);
 }
 
 void hvm_domain_destroy(struct domain *d)
@@ -1549,6 +1554,10 @@ int hvm_vcpu_initialise(struct vcpu *v)
 
     v->arch.hvm.inject_event.vector = HVM_EVENT_VECTOR_UNSET;
 
+    rc = hvmemul_cache_init(v);
+    if ( rc )
+        goto fail4;
+
     rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */
     if ( rc != 0 )
         goto fail4;
@@ -1584,6 +1593,7 @@ int hvm_vcpu_initialise(struct vcpu *v)
  fail5:
     free_compat_arg_xlat(v);
  fail4:
+    hvmemul_cache_destroy(v);
     hvm_funcs.vcpu_destroy(v);
  fail3:
     vlapic_destroy(v);
@@ -2945,6 +2955,7 @@ void hvm_task_switch(
     unsigned int eflags, new_cpl;
     pagefault_info_t pfinfo;
     int exn_raised, rc;
+    unsigned int token = hvmemul_cache_disable(v);
     struct tss32 tss;
 
     hvm_get_segment_register(v, x86_seg_gdtr, &gdt);
@@ -3152,6 +3163,8 @@ void hvm_task_switch(
  out:
     hvm_unmap_entry(optss_desc);
     hvm_unmap_entry(nptss_desc);
+
+    hvmemul_cache_restore(v, token);
 }
 
 enum hvm_translation_result hvm_translate_get_page(
@@ -3242,7 +3255,6 @@ static enum hvm_translation_result __hvm
     gfn_t gfn;
     struct page_info *page;
     p2m_type_t p2mt;
-    char *p;
     int count, todo = size;
 
     ASSERT(is_hvm_vcpu(v));
@@ -3290,11 +3302,17 @@ static enum hvm_translation_result __hvm
             return HVMTRANS_need_retry;
         }
 
-        p = __map_domain_page(page) + pgoff;
-
-        if ( flags & HVMCOPY_to_guest )
+        if ( (flags & HVMCOPY_to_guest) ||
+             !hvmemul_read_cache(v, gfn_to_gaddr(gfn) | pgoff, buf, count) )
         {
-            if ( p2m_is_discard_write(p2mt) )
+            void *p = __map_domain_page(page) + pgoff;
+
+            if ( !(flags & HVMCOPY_to_guest) )
+            {
+                memcpy(buf, p, count);
+                hvmemul_write_cache(v, gfn_to_gaddr(gfn) | pgoff, buf, count);
+            }
+            else if ( p2m_is_discard_write(p2mt) )
             {
                 static unsigned long lastpage;
 
@@ -3311,13 +3329,9 @@ static enum hvm_translation_result __hvm
                     memset(p, 0, count);
                 paging_mark_pfn_dirty(v->domain, _pfn(gfn_x(gfn)));
             }
-        }
-        else
-        {
-            memcpy(buf, p, count);
-        }
 
-        unmap_domain_page(p);
+            unmap_domain_page(p);
+        }
 
         addr += count;
         if ( buf )
--- a/xen/arch/x86/hvm/hypercall.c
+++ b/xen/arch/x86/hvm/hypercall.c
@@ -22,6 +22,7 @@
 #include <xen/hypercall.h>
 #include <xen/nospec.h>
 
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/support.h>
 
 static long hvm_memory_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
@@ -159,6 +160,7 @@ int hvm_hypercall(struct cpu_user_regs *
     struct domain *currd = curr->domain;
     int mode = hvm_guest_x86_mode(curr);
     unsigned long eax = regs->eax;
+    unsigned int token;
 
     switch ( mode )
     {
@@ -183,7 +185,18 @@ int hvm_hypercall(struct cpu_user_regs *
     }
 
     if ( (eax & 0x80000000) && is_viridian_domain(currd) )
-        return viridian_hypercall(regs);
+    {
+        int ret;
+
+        /* See comment below. */
+        token = hvmemul_cache_disable(curr);
+
+        ret = viridian_hypercall(regs);
+
+        hvmemul_cache_restore(curr, token);
+
+        return ret;
+    }
 
     BUILD_BUG_ON(ARRAY_SIZE(hvm_hypercall_table) >
                  ARRAY_SIZE(hypercall_args_table));
@@ -202,6 +215,12 @@ int hvm_hypercall(struct cpu_user_regs *
         return HVM_HCALL_completed;
     }
 
+    /*
+     * Caching is intended for instruction emulation only. Disable it
+     * for any accesses by hypercall argument copy-in / copy-out.
+     */
+    token = hvmemul_cache_disable(curr);
+
     curr->hcall_preempted = false;
 
     if ( mode == 8 )
@@ -295,6 +314,8 @@ int hvm_hypercall(struct cpu_user_regs *
 #endif
     }
 
+    hvmemul_cache_restore(curr, token);
+
     HVM_DBG_LOG(DBG_LEVEL_HCALL, "hcall%lu -> %lx", eax, regs->rax);
 
     if ( curr->hcall_preempted )
--- a/xen/arch/x86/hvm/intercept.c
+++ b/xen/arch/x86/hvm/intercept.c
@@ -20,6 +20,7 @@
 #include <xen/types.h>
 #include <xen/sched.h>
 #include <asm/regs.h>
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/hvm.h>
 #include <asm/hvm/support.h>
 #include <asm/hvm/domain.h>
@@ -163,6 +164,9 @@ int hvm_process_io_intercept(const struc
         {
             if ( p->data_is_ptr )
             {
+                struct vcpu *curr = current;
+                unsigned int token = hvmemul_cache_disable(curr);
+
                 data = 0;
                 switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
                                                   p->size) )
@@ -179,9 +183,11 @@ int hvm_process_io_intercept(const struc
                     ASSERT_UNREACHABLE();
                     /* fall through */
                 default:
-                    domain_crash(current->domain);
+                    domain_crash(curr->domain);
                     return X86EMUL_UNHANDLEABLE;
                 }
+
+                hvmemul_cache_restore(curr, token);
             }
             else
                 data = p->data;
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1040,6 +1040,8 @@ void svm_vmenter_helper(const struct cpu
     struct vcpu *curr = current;
     struct vmcb_struct *vmcb = curr->arch.hvm.svm.vmcb;
 
+    ASSERT(hvmemul_cache_disabled(curr));
+
     svm_asid_handle_vmrun();
 
     if ( unlikely(tb_init_done) )
--- a/xen/arch/x86/hvm/vmsi.c
+++ b/xen/arch/x86/hvm/vmsi.c
@@ -35,6 +35,7 @@
 #include <xen/irq.h>
 #include <xen/vpci.h>
 #include <public/hvm/ioreq.h>
+#include <asm/hvm/emulate.h>
 #include <asm/hvm/io.h>
 #include <asm/hvm/vpic.h>
 #include <asm/hvm/vlapic.h>
@@ -607,6 +608,7 @@ void msix_write_completion(struct vcpu *
     if ( !ctrl_address && snoop_addr &&
          v->arch.hvm.hvm_io.msix_snoop_gpa )
     {
+        unsigned int token = hvmemul_cache_disable(v);
         const struct msi_desc *desc;
         uint32_t data;
 
@@ -621,6 +623,8 @@ void msix_write_completion(struct vcpu *
                                       sizeof(data)) == HVMTRANS_okay &&
              !(data & PCI_MSIX_VECTOR_BITMASK) )
             ctrl_address = snoop_addr;
+
+        hvmemul_cache_restore(v, token);
     }
 
     if ( !ctrl_address )
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -4362,6 +4362,8 @@ bool vmx_vmenter_helper(const struct cpu
     struct hvm_vcpu_asid *p_asid;
     bool_t need_flush;
 
+    ASSERT(hvmemul_cache_disabled(curr));
+
     /* Shadow EPTP can't be updated here because irqs are disabled */
      if ( nestedhvm_vcpu_in_guestmode(curr) && vcpu_nestedhvm(curr).stale_np2m )
          return false;
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -13,6 +13,7 @@
 #define __ASM_X86_HVM_EMULATE_H__
 
 #include <xen/err.h>
+#include <xen/sched.h>
 #include <asm/hvm/hvm.h>
 #include <asm/x86_emulate.h>
 
@@ -97,6 +98,39 @@ int hvmemul_do_pio_buffer(uint16_t port,
                           uint8_t dir,
                           void *buffer);
 
+#ifdef CONFIG_HVM
+/*
+ * The cache controlled by the functions below is not like an ordinary CPU
+ * cache, i.e. aiming to help performance, but a "secret store" which is
+ * needed for correctness.  The issue it helps addressing is the need for
+ * re-execution of an insn (after data was provided by a device model) to
+ * observe the exact same memory state, i.e. to specifically not observe any
+ * updates which may have occurred in the meantime by other agents.
+ * Therefore this cache gets
+ * - enabled when emulation of an insn starts,
+ * - disabled across processing secondary things like a hypercall resulting
+ *   from insn emulation,
+ * - disabled again when an emulated insn is known to not require any
+ *   further re-execution.
+ */
+int __must_check hvmemul_cache_init(struct vcpu *v);
+static inline void hvmemul_cache_destroy(struct vcpu *v)
+{
+    XFREE(v->arch.hvm.hvm_io.cache);
+}
+bool hvmemul_read_cache(const struct vcpu *, paddr_t gpa,
+                        void *buffer, unsigned int size);
+void hvmemul_write_cache(const struct vcpu *, paddr_t gpa,
+                         const void *buffer, unsigned int size);
+unsigned int hvmemul_cache_disable(struct vcpu *);
+void hvmemul_cache_restore(struct vcpu *, unsigned int token);
+/* For use in ASSERT()s only: */
+static inline bool hvmemul_cache_disabled(struct vcpu *v)
+{
+    return hvmemul_cache_disable(v) == hvmemul_cache_disable(v);
+}
+#endif
+
 void hvm_dump_emulation_state(const char *loglvl, const char *prefix,
                               struct hvm_emulate_ctxt *hvmemul_ctxt, int rc);
 
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -77,6 +77,8 @@ struct hvm_vcpu_io {
     /* For retries we shouldn't re-fetch the instruction. */
     unsigned int mmio_insn_bytes;
     unsigned char mmio_insn[16];
+    struct hvmemul_cache *cache;
+
     /*
      * For string instruction emulation we need to be able to signal a
      * necessary retry through other than function return codes.

Re: [Xen-devel] [PATCH v5 2/4] x86/HVM: implement memory read caching for insn emulation
Posted by Tian, Kevin 4 years, 1 month ago
> From: Jan Beulich <jbeulich@suse.com>
> Sent: Tuesday, March 3, 2020 6:17 PM
> 
> Emulation requiring device model assistance uses a form of instruction
> re-execution, assuming that the second (and any further) pass takes
> exactly the same path. This is a valid assumption as far as use of CPU

Ah, I was not aware of such a form. I thought the emulation was split
into two phases: decoding and sending an I/O request to the device
model, and then completing insn emulation with the device model's
response and previously-decoded information...

> registers goes (as those can't change without any other instruction
> executing in between [1]), but is wrong for memory accesses. In
> particular it has been observed that Windows might page out buffers
> underneath an instruction currently under emulation (hitting between two
> passes). If the first pass read a memory operand successfully, any
> subsequent pass needs to get to see the exact same value.
> 
> Introduce a cache to make sure above described assumption holds. This
> is a very simplistic implementation for now: Only exact matches are
> satisfied (no overlaps or partial reads or anything); this is sufficient
> for the immediate purpose of making re-execution an exact replay. The
> cache also won't be used just yet for guest page walks; that'll be the
> subject of a subsequent change.

a cache implies that the aforementioned two-pass problem is only
mitigated instead of completely fixed?

btw is there any performance impact from this patch?

Re: [Xen-devel] [PATCH v5 2/4] x86/HVM: implement memory read caching for insn emulation
Posted by Jan Beulich 4 years, 1 month ago
On 10.03.2020 03:39, Tian, Kevin wrote:
>> From: Jan Beulich <jbeulich@suse.com>
>> Sent: Tuesday, March 3, 2020 6:17 PM
>>
>> Emulation requiring device model assistance uses a form of instruction
>> re-execution, assuming that the second (and any further) pass takes
>> exactly the same path. This is a valid assumption as far as use of CPU
> 
> Ah, I was not aware of such a form. I thought the emulation was split
> into two phases: decoding and sending an I/O request to the device
> model, and then completing insn emulation with the device model's
> response and previously-decoded information...

In theory this would be an option, but would require storing quite
a bit more information to be able to resume without going through
decode again. Plus it's not decode alone which matters; page walks
during the execution phase, for example, also need to match.

>> registers goes (as those can't change without any other instruction
>> executing in between [1]), but is wrong for memory accesses. In
>> particular it has been observed that Windows might page out buffers
>> underneath an instruction currently under emulation (hitting between two
>> passes). If the first pass read a memory operand successfully, any
>> subsequent pass needs to get to see the exact same value.
>>
>> Introduce a cache to make sure above described assumption holds. This
>> is a very simplistic implementation for now: Only exact matches are
>> satisfied (no overlaps or partial reads or anything); this is sufficient
>> for the immediate purpose of making re-execution an exact replay. The
>> cache also won't be used just yet for guest page walks; that'll be the
>> subject of a subsequent change.
> 
> a cache implies that the aforementioned two-pass problem is only
> mitigated instead of completely fixed?

No, aiui the English word "cache" is broader than what it's typically
used for with computers in mind - see e.g. its use in "geocaching". I
realize the use of the word here may cause misunderstandings, but my
search for a better term in earlier versions hasn't really led to any
suggestions I'd consider strictly better.

> btw is there any performance impact from this patch?

Since correctness is the goal, I didn't think I'd need to measure
things to support the utility of the changes.

Jan

[Xen-devel] [PATCH v5 3/4] x86/mm: use cache in guest_walk_tables()
Posted by Jan Beulich 4 years, 1 month ago
Emulation requiring device model assistance uses a form of instruction
re-execution, assuming that the second (and any further) pass takes
exactly the same path. This is a valid assumption as far as use of CPU
registers goes (as those can't change without any other instruction
executing in between [1]), but is wrong for memory accesses. In
particular it has been observed that Windows might page out buffers
underneath an instruction currently under emulation (hitting between two
passes). If the first pass translated a linear address successfully, any
subsequent pass needs to do so too, yielding the exact same translation.
To guarantee this, leverage the caching that now backs HVM insn
emulation.

[1] Other than on actual hardware, actions like
    XEN_DOMCTL_sethvmcontext, XEN_DOMCTL_setvcpucontext,
    VCPUOP_initialise, INIT, or SIPI issued against the vCPU can occur
    while the vCPU is blocked waiting for a device model to return data.
    In such cases emulation now gets canceled, though, and hence re-
    execution correctness is unaffected.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: Move hvmemul_{read,write}_cache() stubs here.
v4: Adjust for cache now (elsewhere) being transparent to callers.
    Provide inline stubs for the !HVM case.
v2: Don't wrongly use top_gfn for non-root gpa calculation. Re-write
    cache entries after setting A/D bits (an alternative would be to
    suppress their setting upon cache hits).

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -2940,7 +2940,7 @@ bool hvmemul_read_cache(const struct vcp
     unsigned int i;
 
     /* Cache unavailable? */
-    if ( cache->num_ents > cache->max_ents )
+    if ( !is_hvm_vcpu(v) || cache->num_ents > cache->max_ents )
         return false;
 
     while ( size > sizeof(cache->ents->data) )
@@ -2972,7 +2972,7 @@ void hvmemul_write_cache(const struct vc
     unsigned int i;
 
     /* Cache unavailable? */
-    if ( cache->num_ents > cache->max_ents )
+    if ( !is_hvm_vcpu(v) || cache->num_ents > cache->max_ents )
         return;
 
     while ( size > sizeof(cache->ents->data) )
--- a/xen/arch/x86/mm/guest_walk.c
+++ b/xen/arch/x86/mm/guest_walk.c
@@ -31,6 +31,7 @@ asm(".file \"" __OBJECT_FILE__ "\"");
 #include <xen/sched.h>
 #include <asm/page.h>
 #include <asm/guest_pt.h>
+#include <asm/hvm/emulate.h>
 
 /*
  * Modify a guest pagetable entry to set the Accessed and Dirty bits.
@@ -80,9 +81,9 @@ static bool set_ad_bits(guest_intpte_t *
  * requested walk, to see whether the access is permitted.
  */
 bool
-guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
-                  unsigned long va, walk_t *gw,
-                  uint32_t walk, mfn_t top_mfn, void *top_map)
+guest_walk_tables(const struct vcpu *v, struct p2m_domain *p2m,
+                  unsigned long va, walk_t *gw, uint32_t walk,
+                  gfn_t top_gfn, mfn_t top_mfn, void *top_map)
 {
     struct domain *d = v->domain;
     guest_l1e_t *l1p = NULL;
@@ -90,8 +91,13 @@ guest_walk_tables(struct vcpu *v, struct
 #if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
     guest_l3e_t *l3p = NULL;
     guest_l4e_t *l4p;
+    paddr_t l4gpa;
+#endif
+#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
+    paddr_t l3gpa;
 #endif
     uint32_t gflags, rc;
+    paddr_t l1gpa = 0, l2gpa = 0;
     unsigned int leaf_level;
     p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE;
 
@@ -132,7 +138,13 @@ guest_walk_tables(struct vcpu *v, struct
     /* Get the l4e from the top level table and check its flags*/
     gw->l4mfn = top_mfn;
     l4p = (guest_l4e_t *) top_map;
-    gw->l4e = l4p[guest_l4_table_offset(va)];
+    l4gpa = gfn_to_gaddr(top_gfn) +
+            guest_l4_table_offset(va) * sizeof(gw->l4e);
+    if ( !hvmemul_read_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e)) )
+    {
+        gw->l4e = l4p[guest_l4_table_offset(va)];
+        hvmemul_write_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e));
+    }
     gflags = guest_l4e_get_flags(gw->l4e);
     if ( !(gflags & _PAGE_PRESENT) )
         goto out;
@@ -161,7 +173,13 @@ guest_walk_tables(struct vcpu *v, struct
     }
 
     /* Get the l3e and check its flags*/
-    gw->l3e = l3p[guest_l3_table_offset(va)];
+    l3gpa = gfn_to_gaddr(guest_l4e_get_gfn(gw->l4e)) +
+            guest_l3_table_offset(va) * sizeof(gw->l3e);
+    if ( !hvmemul_read_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e)) )
+    {
+        gw->l3e = l3p[guest_l3_table_offset(va)];
+        hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
+    }
     gflags = guest_l3e_get_flags(gw->l3e);
     if ( !(gflags & _PAGE_PRESENT) )
         goto out;
@@ -213,7 +231,14 @@ guest_walk_tables(struct vcpu *v, struct
 #else /* PAE only... */
 
     /* Get the l3e and check its flag */
-    gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
+    l3gpa = gfn_to_gaddr(top_gfn) + ((unsigned long)top_map & ~PAGE_MASK) +
+            guest_l3_table_offset(va) * sizeof(gw->l3e);
+    if ( !hvmemul_read_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e)) )
+    {
+        gw->l3e = ((guest_l3e_t *)top_map)[guest_l3_table_offset(va)];
+        hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
+    }
+
     gflags = guest_l3e_get_flags(gw->l3e);
     if ( !(gflags & _PAGE_PRESENT) )
         goto out;
@@ -238,18 +263,24 @@ guest_walk_tables(struct vcpu *v, struct
         goto out;
     }
 
-    /* Get the l2e */
-    gw->l2e = l2p[guest_l2_table_offset(va)];
+    l2gpa = gfn_to_gaddr(guest_l3e_get_gfn(gw->l3e));
 
 #else /* 32-bit only... */
 
-    /* Get l2e from the top level table */
     gw->l2mfn = top_mfn;
     l2p = (guest_l2e_t *) top_map;
-    gw->l2e = l2p[guest_l2_table_offset(va)];
+    l2gpa = gfn_to_gaddr(top_gfn);
 
 #endif /* All levels... */
 
+    /* Get the l2e */
+    l2gpa += guest_l2_table_offset(va) * sizeof(gw->l2e);
+    if ( !hvmemul_read_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e)) )
+    {
+        gw->l2e = l2p[guest_l2_table_offset(va)];
+        hvmemul_write_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e));
+    }
+
     /* Check the l2e flags. */
     gflags = guest_l2e_get_flags(gw->l2e);
     if ( !(gflags & _PAGE_PRESENT) )
@@ -330,7 +361,15 @@ guest_walk_tables(struct vcpu *v, struct
         gw->pfec |= rc & PFEC_synth_mask;
         goto out;
     }
-    gw->l1e = l1p[guest_l1_table_offset(va)];
+
+    l1gpa = gfn_to_gaddr(guest_l2e_get_gfn(gw->l2e)) +
+            guest_l1_table_offset(va) * sizeof(gw->l1e);
+    if ( !hvmemul_read_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e)) )
+    {
+        gw->l1e = l1p[guest_l1_table_offset(va)];
+        hvmemul_write_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e));
+    }
+
     gflags = guest_l1e_get_flags(gw->l1e);
     if ( !(gflags & _PAGE_PRESENT) )
         goto out;
@@ -441,22 +480,34 @@ guest_walk_tables(struct vcpu *v, struct
     case 1:
         if ( set_ad_bits(&l1p[guest_l1_table_offset(va)].l1, &gw->l1e.l1,
                          (walk & PFEC_write_access)) )
+        {
             paging_mark_dirty(d, gw->l1mfn);
+            hvmemul_write_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e));
+        }
         /* Fallthrough */
     case 2:
         if ( set_ad_bits(&l2p[guest_l2_table_offset(va)].l2, &gw->l2e.l2,
                          (walk & PFEC_write_access) && leaf_level == 2) )
+        {
             paging_mark_dirty(d, gw->l2mfn);
+            hvmemul_write_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e));
+        }
         /* Fallthrough */
 #if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
     case 3:
         if ( set_ad_bits(&l3p[guest_l3_table_offset(va)].l3, &gw->l3e.l3,
                          (walk & PFEC_write_access) && leaf_level == 3) )
+        {
             paging_mark_dirty(d, gw->l3mfn);
+            hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
+        }
 
         if ( set_ad_bits(&l4p[guest_l4_table_offset(va)].l4, &gw->l4e.l4,
                          false) )
+        {
             paging_mark_dirty(d, gw->l4mfn);
+            hvmemul_write_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e));
+        }
 #endif
     }
 
--- a/xen/arch/x86/mm/hap/guest_walk.c
+++ b/xen/arch/x86/mm/hap/guest_walk.c
@@ -91,7 +91,8 @@ unsigned long hap_p2m_ga_to_gfn(GUEST_PA
 #if GUEST_PAGING_LEVELS == 3
     top_map += (cr3 & ~(PAGE_MASK | 31));
 #endif
-    walk_ok = guest_walk_tables(v, p2m, ga, &gw, *pfec, top_mfn, top_map);
+    walk_ok = guest_walk_tables(v, p2m, ga, &gw, *pfec,
+                                top_gfn, top_mfn, top_map);
     unmap_domain_page(top_map);
     put_page(top_page);
 
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -175,9 +175,13 @@ static inline bool
 sh_walk_guest_tables(struct vcpu *v, unsigned long va, walk_t *gw,
                      uint32_t pfec)
 {
+    gfn_t root_gfn = _gfn(paging_mode_external(v->domain)
+                          ? cr3_pa(v->arch.hvm.guest_cr[3]) >> PAGE_SHIFT
+                          : pagetable_get_pfn(v->arch.guest_table));
+
 #if GUEST_PAGING_LEVELS == 3 /* PAE */
     return guest_walk_tables(v, p2m_get_hostp2m(v->domain), va, gw, pfec,
-                             INVALID_MFN, v->arch.paging.shadow.gl3e);
+                             root_gfn, INVALID_MFN, v->arch.paging.shadow.gl3e);
 #else /* 32 or 64 */
     const struct domain *d = v->domain;
     mfn_t root_mfn = (v->arch.flags & TF_kernel_mode
@@ -185,7 +189,7 @@ sh_walk_guest_tables(struct vcpu *v, uns
                       : pagetable_get_mfn(v->arch.guest_table_user));
     void *root_map = map_domain_page(root_mfn);
     bool ok = guest_walk_tables(v, p2m_get_hostp2m(d), va, gw, pfec,
-                                root_mfn, root_map);
+                                root_gfn, root_mfn, root_map);
 
     unmap_domain_page(root_map);
 
--- a/xen/include/asm-x86/guest_pt.h
+++ b/xen/include/asm-x86/guest_pt.h
@@ -428,8 +428,9 @@ static inline unsigned int guest_walk_to
 #define guest_walk_tables GPT_RENAME(guest_walk_tables, GUEST_PAGING_LEVELS)
 
 bool
-guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m, unsigned long va,
-                  walk_t *gw, uint32_t pfec, mfn_t top_mfn, void *top_map);
+guest_walk_tables(const struct vcpu *v, struct p2m_domain *p2m,
+                  unsigned long va, walk_t *gw, uint32_t pfec,
+                  gfn_t top_gfn, mfn_t top_mfn, void *top_map);
 
 /* Pretty-print the contents of a guest-walk */
 static inline void print_gw(const walk_t *gw)
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -129,6 +129,12 @@ static inline bool hvmemul_cache_disable
 {
     return hvmemul_cache_disable(v) == hvmemul_cache_disable(v);
 }
+#else
+static inline bool hvmemul_read_cache(const struct vcpu *v, paddr_t gpa,
+                                      void *buf,
+                                      unsigned int size) { return false; }
+static inline void hvmemul_write_cache(const struct vcpu *v, paddr_t gpa,
+                                       const void *buf, unsigned int size) {}
 #endif
 
 void hvm_dump_emulation_state(const char *loglvl, const char *prefix,

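(Illustration only, not part of the patch: the read-through pattern the
hunks above add at each level of the walk, written out once as a
standalone helper. "fetch_l1e" is a made-up name. hvmemul_read_cache()
returns false on a miss - or when no cache is active - in which case the
entry is read from the mapped table and latched in the cache, so that a
re-executed walk observes the very same PTE value.)

static guest_l1e_t fetch_l1e(const struct vcpu *v, const guest_l1e_t *l1p,
                             unsigned long va, paddr_t l1gpa)
{
    guest_l1e_t l1e;

    /* Satisfy the read from the cache when possible ... */
    if ( !hvmemul_read_cache(v, l1gpa, &l1e, sizeof(l1e)) )
    {
        /* ... else read the live entry and record it for re-execution. */
        l1e = l1p[guest_l1_table_offset(va)];
        hvmemul_write_cache(v, l1gpa, &l1e, sizeof(l1e));
    }

    return l1e;
}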
_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel
[Xen-devel] [PATCH v5 4/4] x86/HVM: __hvm_copy()'s size parameter is an unsigned quantity
Posted by Jan Beulich 4 years, 1 month ago
There are no negative sizes. Make the function's "size" parameter, as
well as that of its derivatives, "unsigned int". Similarly make its
local "count" variable "unsigned int", and drop "todo" altogether. Use
min() instead of min_t() to calculate "count", and restrict its scope
as well as that of other local variables of the function.
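
For reference, the chunking then operates on unsigned quantities only;
e.g. a 5000-byte copy starting at page offset 0xf00 gets split into
256-, 4096-, and 648-byte pieces. A minimal sketch of the loop,
mirroring the hunk below (PAGE_SIZE being 4096):

    while ( size > 0 )
    {
        unsigned int pgoff = addr & ~PAGE_MASK;
        /* Never cross a page boundary: 256, then 4096, then 648 bytes. */
        unsigned int count = min((unsigned int)PAGE_SIZE - pgoff, size);

        /* ... translate, map, and copy "count" bytes here ... */

        addr += count;
        size -= count;
    }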

While at it I've also noticed that {copy_{from,to},clear}_user_hvm()
have been returning "unsigned long" for no apparent reason, as their
respective "len" parameters are already "unsigned int". Adjust this,
and correct a slightly wrong comment there at the same time.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
v5: New.

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3249,14 +3249,9 @@ enum hvm_translation_result hvm_translat
 #define HVMCOPY_phys       (0u<<2)
 #define HVMCOPY_linear     (1u<<2)
 static enum hvm_translation_result __hvm_copy(
-    void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags,
+    void *buf, paddr_t addr, unsigned int size, struct vcpu *v, unsigned int flags,
     uint32_t pfec, pagefault_info_t *pfinfo)
 {
-    gfn_t gfn;
-    struct page_info *page;
-    p2m_type_t p2mt;
-    int count, todo = size;
-
     ASSERT(is_hvm_vcpu(v));
 
     /*
@@ -3275,12 +3270,14 @@ static enum hvm_translation_result __hvm
         return HVMTRANS_unhandleable;
 #endif
 
-    while ( todo > 0 )
+    while ( size > 0 )
     {
+        struct page_info *page;
+        gfn_t gfn;
+        p2m_type_t p2mt;
         enum hvm_translation_result res;
         unsigned int pgoff = addr & ~PAGE_MASK;
-
-        count = min_t(int, PAGE_SIZE - pgoff, todo);
+        unsigned int count = min((unsigned int)PAGE_SIZE - pgoff, size);
 
         res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear,
                                      pfec, pfinfo, &page, &gfn, &p2mt);
@@ -3336,7 +3333,7 @@ static enum hvm_translation_result __hvm
         addr += count;
         if ( buf )
             buf += count;
-        todo -= count;
+        size -= count;
         put_page(page);
     }
 
@@ -3344,21 +3341,21 @@ static enum hvm_translation_result __hvm
 }
 
 enum hvm_translation_result hvm_copy_to_guest_phys(
-    paddr_t paddr, void *buf, int size, struct vcpu *v)
+    paddr_t paddr, void *buf, unsigned int size, struct vcpu *v)
 {
     return __hvm_copy(buf, paddr, size, v,
                       HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL);
 }
 
 enum hvm_translation_result hvm_copy_from_guest_phys(
-    void *buf, paddr_t paddr, int size)
+    void *buf, paddr_t paddr, unsigned int size)
 {
     return __hvm_copy(buf, paddr, size, current,
                       HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL);
 }
 
 enum hvm_translation_result hvm_copy_to_guest_linear(
-    unsigned long addr, void *buf, int size, uint32_t pfec,
+    unsigned long addr, void *buf, unsigned int size, uint32_t pfec,
     pagefault_info_t *pfinfo)
 {
     return __hvm_copy(buf, addr, size, current,
@@ -3367,7 +3364,7 @@ enum hvm_translation_result hvm_copy_to_
 }
 
 enum hvm_translation_result hvm_copy_from_guest_linear(
-    void *buf, unsigned long addr, int size, uint32_t pfec,
+    void *buf, unsigned long addr, unsigned int size, uint32_t pfec,
     pagefault_info_t *pfinfo)
 {
     return __hvm_copy(buf, addr, size, current,
@@ -3375,7 +3372,7 @@ enum hvm_translation_result hvm_copy_fro
                       PFEC_page_present | pfec, pfinfo);
 }
 
-unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len)
+unsigned int copy_to_user_hvm(void *to, const void *from, unsigned int len)
 {
     int rc;
 
@@ -3389,7 +3386,7 @@ unsigned long copy_to_user_hvm(void *to,
     return rc ? len : 0; /* fake a copy_to_user() return code */
 }
 
-unsigned long clear_user_hvm(void *to, unsigned int len)
+unsigned int clear_user_hvm(void *to, unsigned int len)
 {
     int rc;
 
@@ -3400,10 +3397,11 @@ unsigned long clear_user_hvm(void *to, u
     }
 
     rc = hvm_copy_to_guest_linear((unsigned long)to, NULL, len, 0, NULL);
-    return rc ? len : 0; /* fake a copy_to_user() return code */
+
+    return rc ? len : 0; /* fake a clear_user() return code */
 }
 
-unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len)
+unsigned int copy_from_user_hvm(void *to, const void *from, unsigned int len)
 {
     int rc;
 
--- a/xen/include/asm-x86/hvm/guest_access.h
+++ b/xen/include/asm-x86/hvm/guest_access.h
@@ -1,8 +1,8 @@
 #ifndef __ASM_X86_HVM_GUEST_ACCESS_H__
 #define __ASM_X86_HVM_GUEST_ACCESS_H__
 
-unsigned long copy_to_user_hvm(void *to, const void *from, unsigned len);
-unsigned long clear_user_hvm(void *to, unsigned int len);
-unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len);
+unsigned int copy_to_user_hvm(void *to, const void *from, unsigned int len);
+unsigned int clear_user_hvm(void *to, unsigned int len);
+unsigned int copy_from_user_hvm(void *to, const void *from, unsigned int len);
 
 #endif /* __ASM_X86_HVM_GUEST_ACCESS_H__ */
--- a/xen/include/asm-x86/hvm/support.h
+++ b/xen/include/asm-x86/hvm/support.h
@@ -70,9 +70,9 @@ enum hvm_translation_result {
  * address range does not map entirely onto ordinary machine memory.
  */
 enum hvm_translation_result hvm_copy_to_guest_phys(
-    paddr_t paddr, void *buf, int size, struct vcpu *v);
+    paddr_t paddr, void *buf, unsigned int size, struct vcpu *v);
 enum hvm_translation_result hvm_copy_from_guest_phys(
-    void *buf, paddr_t paddr, int size);
+    void *buf, paddr_t paddr, unsigned int size);
 
 /*
  * Copy to/from a guest linear address. @pfec should include PFEC_user_mode
@@ -96,10 +96,10 @@ typedef struct pagefault_info
 } pagefault_info_t;
 
 enum hvm_translation_result hvm_copy_to_guest_linear(
-    unsigned long addr, void *buf, int size, uint32_t pfec,
+    unsigned long addr, void *buf, unsigned int size, uint32_t pfec,
     pagefault_info_t *pfinfo);
 enum hvm_translation_result hvm_copy_from_guest_linear(
-    void *buf, unsigned long addr, int size, uint32_t pfec,
+    void *buf, unsigned long addr, unsigned int size, uint32_t pfec,
     pagefault_info_t *pfinfo);
 
 /*


_______________________________________________
Xen-devel mailing list
Xen-devel@lists.xenproject.org
https://lists.xenproject.org/mailman/listinfo/xen-devel