[v3] Implementing a MSHV (Microsoft Hypervisor) accelerator

[PATCH v3 10/26] accel/mshv: Add vCPU creation and execution loop

Posted by Magnus Kulke 6 months ago

Create MSHV vCPUs using MSHV_CREATE_VP and initialize their state.
Register the MSHV CPU execution loop with the QEMU accelerator
framework to enable guest code execution.

The target/i386 functionality is still mostly stubbed out and will be
populated in a later commit in this series.

Signed-off-by: Magnus Kulke <magnuskulke@linux.microsoft.com>
---
 accel/mshv/mshv-all.c       | 187 +++++++++++++++++++++++++++++++++---
 accel/mshv/trace-events     |   2 +
 include/system/mshv.h       |  17 ++++
 target/i386/mshv/mshv-cpu.c |  63 ++++++++++++
 4 files changed, 257 insertions(+), 12 deletions(-)

diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c
index 54c32d6252..a4eeaeec76 100644
--- a/accel/mshv/mshv-all.c
+++ b/accel/mshv/mshv-all.c
@@ -391,6 +391,24 @@ int mshv_hvcall(int vm_fd, const struct mshv_root_hvcall *args)
     return ret;
 }
 
+static int mshv_init_vcpu(CPUState *cpu)
+{
+    int vm_fd = mshv_state->vm;
+    uint8_t vp_index = cpu->cpu_index;
+    int ret;
+
+    mshv_arch_init_vcpu(cpu);
+    cpu->accel = g_new0(AccelCPUState, 1);
+
+    ret = mshv_create_vcpu(vm_fd, vp_index, &cpu->accel->cpufd);
+    if (ret < 0) {
+        return -1;
+    }
+
+    cpu->accel->dirty = true;
+
+    return 0;
+}
 
 static int mshv_init(AccelState *as, MachineState *ms)
 {
@@ -413,6 +431,8 @@ static int mshv_init(AccelState *as, MachineState *ms)
         return -1;
     }
 
+    mshv_init_cpu_logic();
+
     mshv_init_msicontrol();
 
     ret = create_vm(mshv_fd, &vm_fd);
@@ -442,40 +462,183 @@ static int mshv_init(AccelState *as, MachineState *ms)
     return 0;
 }
 
+static int mshv_destroy_vcpu(CPUState *cpu)
+{
+    int cpu_fd = mshv_vcpufd(cpu);
+    int vm_fd = mshv_state->vm;
+
+    mshv_remove_vcpu(vm_fd, cpu_fd);
+    mshv_vcpufd(cpu) = 0;
+
+    mshv_arch_destroy_vcpu(cpu);
+    g_free(cpu->accel);
+    return 0;
+}
+
+static int mshv_cpu_exec(CPUState *cpu)
+{
+    hv_message mshv_msg;
+    enum MshvVmExit exit_reason;
+    int ret = 0;
+
+    bql_unlock();
+    cpu_exec_start(cpu);
+
+    do {
+        if (cpu->accel->dirty) {
+            ret = mshv_arch_put_registers(cpu);
+            if (ret) {
+                error_report("Failed to put registers after init: %s",
+                              strerror(-ret));
+                ret = -1;
+                break;
+            }
+            cpu->accel->dirty = false;
+        }
+
+        ret = mshv_run_vcpu(mshv_state->vm, cpu, &mshv_msg, &exit_reason);
+        if (ret < 0) {
+            error_report("Failed to run on vcpu %d", cpu->cpu_index);
+            abort();
+        }
+
+        switch (exit_reason) {
+        case MshvVmExitIgnore:
+            break;
+        default:
+            ret = EXCP_INTERRUPT;
+            break;
+        }
+    } while (ret == 0);
+
+    cpu_exec_end(cpu);
+    bql_lock();
+
+    if (ret < 0) {
+        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
+        vm_stop(RUN_STATE_INTERNAL_ERROR);
+    }
+
+    qatomic_set(&cpu->exit_request, 0);
+    return ret;
+}
+
+static void *mshv_vcpu_thread(void *arg)
+{
+    CPUState *cpu = arg;
+    int ret;
+
+    rcu_register_thread();
+
+    bql_lock();
+    qemu_thread_get_self(cpu->thread);
+    cpu->thread_id = qemu_get_thread_id();
+    current_cpu = cpu;
+    ret = mshv_init_vcpu(cpu);
+    if (ret < 0) {
+        error_report("Failed to init vcpu %d", cpu->cpu_index);
+        goto cleanup;
+    }
+
+    /* signal CPU creation */
+    cpu_thread_signal_created(cpu);
+    qemu_guest_random_seed_thread_part2(cpu->random_seed);
+
+    do {
+        if (cpu_can_run(cpu)) {
+            mshv_cpu_exec(cpu);
+        }
+        qemu_wait_io_event(cpu);
+    } while (!cpu->unplug || cpu_can_run(cpu));
+
+    mshv_destroy_vcpu(cpu);
+cleanup:
+    cpu_thread_signal_destroyed(cpu);
+    bql_unlock();
+    rcu_unregister_thread();
+    return NULL;
+}
+
 static void mshv_start_vcpu_thread(CPUState *cpu)
 {
-    error_report("unimplemented");
-    abort();
+    char thread_name[VCPU_THREAD_NAME_SIZE];
+
+    cpu->thread = g_malloc0(sizeof(QemuThread));
+    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
+
+    qemu_cond_init(cpu->halt_cond);
+
+    trace_mshv_start_vcpu_thread(thread_name, cpu->cpu_index);
+    qemu_thread_create(cpu->thread, thread_name, mshv_vcpu_thread, cpu,
+                       QEMU_THREAD_JOINABLE);
+}
+
+static void do_mshv_cpu_synchronize_post_init(CPUState *cpu,
+                                              run_on_cpu_data arg)
+{
+    int ret = mshv_arch_put_registers(cpu);
+    if (ret < 0) {
+        error_report("Failed to put registers after init: %s", strerror(-ret));
+        abort();
+    }
+
+    cpu->accel->dirty = false;
 }
 
 static void mshv_cpu_synchronize_post_init(CPUState *cpu)
 {
-    error_report("unimplemented");
-    abort();
+    run_on_cpu(cpu, do_mshv_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
 }
 
 static void mshv_cpu_synchronize_post_reset(CPUState *cpu)
 {
-    error_report("unimplemented");
-    abort();
+    int ret = mshv_arch_put_registers(cpu);
+    if (ret) {
+        error_report("Failed to put registers after reset: %s",
+                     strerror(-ret));
+        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
+        vm_stop(RUN_STATE_INTERNAL_ERROR);
+    }
+    cpu->accel->dirty = false;
+}
+
+static void do_mshv_cpu_synchronize_pre_loadvm(CPUState *cpu,
+                                               run_on_cpu_data arg)
+{
+    cpu->accel->dirty = true;
 }
 
 static void mshv_cpu_synchronize_pre_loadvm(CPUState *cpu)
 {
-    error_report("unimplemented");
-    abort();
+    run_on_cpu(cpu, do_mshv_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
+}
+
+static void do_mshv_cpu_synchronize(CPUState *cpu, run_on_cpu_data arg)
+{
+    if (!cpu->accel->dirty) {
+        int ret = mshv_load_regs(cpu);
+        if (ret < 0) {
+            error_report("Failed to load registers for vcpu %d",
+                         cpu->cpu_index);
+
+            cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
+            vm_stop(RUN_STATE_INTERNAL_ERROR);
+        }
+
+        cpu->accel->dirty = true;
+    }
 }
 
 static void mshv_cpu_synchronize(CPUState *cpu)
 {
-    error_report("unimplemented");
-    abort();
+    if (!cpu->accel->dirty) {
+        run_on_cpu(cpu, do_mshv_cpu_synchronize, RUN_ON_CPU_NULL);
+    }
 }
 
 static bool mshv_cpus_are_resettable(void)
 {
-    error_report("unimplemented");
-    abort();
+    return false;
 }
 
 static void mshv_accel_class_init(ObjectClass *oc, const void *data)
diff --git a/accel/mshv/trace-events b/accel/mshv/trace-events
index 5ea5e74722..1b1b43a1e8 100644
--- a/accel/mshv/trace-events
+++ b/accel/mshv/trace-events
@@ -1,5 +1,7 @@
 # See docs/devel/tracing.rst for syntax documentation.
 
+mshv_start_vcpu_thread(const char* thread, uint32_t cpu) "thread=%s cpu_index=%d"
+
 mshv_set_memory(bool add, uint64_t gpa, uint64_t size, uint64_t user_addr, bool readonly, int ret) "add=%d gpa=0x%lx size=0x%lx user=0x%lx readonly=%d result=%d"
 mshv_mem_ioeventfd_add(uint64_t addr, uint32_t size, uint32_t data) "addr=0x%lx size=%d data=0x%x"
 mshv_mem_ioeventfd_del(uint64_t addr, uint32_t size, uint32_t data) "addr=0x%lx size=%d data=0x%x"
diff --git a/include/system/mshv.h b/include/system/mshv.h
index f2ffbe4ace..301228a813 100644
--- a/include/system/mshv.h
+++ b/include/system/mshv.h
@@ -72,12 +72,29 @@ typedef struct MshvMsiControl {
     GHashTable *gsi_routes;
 } MshvMsiControl;
 
+#define mshv_vcpufd(cpu) (cpu->accel->cpufd)
+
 #else /* CONFIG_MSHV_IS_POSSIBLE */
 #define mshv_enabled() false
 #endif
 #define mshv_msi_via_irqfd_enabled() mshv_enabled()
 
 /* cpu */
+typedef enum MshvVmExit {
+    MshvVmExitIgnore   = 0,
+    MshvVmExitShutdown = 1,
+    MshvVmExitSpecial  = 2,
+} MshvVmExit;
+
+void mshv_init_cpu_logic(void);
+int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd);
+void mshv_remove_vcpu(int vm_fd, int cpu_fd);
+int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit);
+int mshv_load_regs(CPUState *cpu);
+int mshv_store_regs(CPUState *cpu);
+int mshv_arch_put_registers(const CPUState *cpu);
+void mshv_arch_init_vcpu(CPUState *cpu);
+void mshv_arch_destroy_vcpu(CPUState *cpu);
 void mshv_arch_amend_proc_features(
     union hv_partition_synthetic_processor_features *features);
 int mshv_arch_post_init_vm(int vm_fd);
diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c
index c00e98dfba..2fe5319201 100644
--- a/target/i386/mshv/mshv-cpu.c
+++ b/target/i386/mshv/mshv-cpu.c
@@ -20,16 +20,79 @@
 #include "hw/hyperv/hvhdk_mini.h"
 #include "hw/hyperv/hvgdk.h"
 
+#include "cpu.h"
+#include "emulate/x86_decode.h"
+#include "emulate/x86_emu.h"
+#include "emulate/x86_flags.h"
 
 #include "trace-accel_mshv.h"
 #include "trace.h"
 
+int mshv_store_regs(CPUState *cpu)
+{
+    error_report("unimplemented");
+    abort();
+}
+
+int mshv_load_regs(CPUState *cpu)
+{
+    error_report("unimplemented");
+    abort();
+}
+
+int mshv_arch_put_registers(const CPUState *cpu)
+{
+    error_report("unimplemented");
+    abort();
+}
+
 void mshv_arch_amend_proc_features(
     union hv_partition_synthetic_processor_features *features)
 {
     features->access_guest_idle_reg = 1;
 }
 
+int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit)
+{
+    error_report("unimplemented");
+    abort();
+}
+
+void mshv_remove_vcpu(int vm_fd, int cpu_fd)
+{
+    error_report("unimplemented");
+    abort();
+}
+
+int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd)
+{
+    error_report("unimplemented");
+    abort();
+}
+
+void mshv_init_cpu_logic(void)
+{
+    error_report("unimplemented");
+    abort();
+}
+
+void mshv_arch_init_vcpu(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+
+    env->emu_mmio_buf = g_new(char, 4096);
+}
+
+void mshv_arch_destroy_vcpu(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+
+    g_free(env->emu_mmio_buf);
+    env->emu_mmio_buf = NULL;
+}
+
 /*
  * Default Microsoft Hypervisor behavior for unimplemented MSR is to send a
  * fault to the guest if it tries to access it. It is possible to override
-- 
2.34.1

Re: [PATCH v3 10/26] accel/mshv: Add vCPU creation and execution loop

Posted by Daniel P. Berrangé 5 months, 2 weeks ago

On Thu, Aug 07, 2025 at 04:39:35PM +0200, Magnus Kulke wrote:
> Create MSHV vCPUs using MSHV_CREATE_VP and initialize their state.
> Register the MSHV CPU execution loop loop with the QEMU accelerator
> framework to enable guest code execution.
> 
> The target/i386 functionality is still mostly stubbed out and will be
> populated in a later commit in this series.
> 
> Signed-off-by: Magnus Kulke <magnuskulke@linux.microsoft.com>
> ---
>  accel/mshv/mshv-all.c       | 187 +++++++++++++++++++++++++++++++++---
>  accel/mshv/trace-events     |   2 +
>  include/system/mshv.h       |  17 ++++
>  target/i386/mshv/mshv-cpu.c |  63 ++++++++++++
>  4 files changed, 257 insertions(+), 12 deletions(-)
> 
> diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c
> index 54c32d6252..a4eeaeec76 100644
> --- a/accel/mshv/mshv-all.c
> +++ b/accel/mshv/mshv-all.c
> @@ -391,6 +391,24 @@ int mshv_hvcall(int vm_fd, const struct mshv_root_hvcall *args)
>      return ret;
>  }
>  
> +static int mshv_init_vcpu(CPUState *cpu)
> +{
> +    int vm_fd = mshv_state->vm;
> +    uint8_t vp_index = cpu->cpu_index;
> +    int ret;
> +
> +    mshv_arch_init_vcpu(cpu);
> +    cpu->accel = g_new0(AccelCPUState, 1);
> +
> +    ret = mshv_create_vcpu(vm_fd, vp_index, &cpu->accel->cpufd);
> +    if (ret < 0) {
> +        return -1;
> +    }
> +
> +    cpu->accel->dirty = true;
> +
> +    return 0;
> +}
>  
>  static int mshv_init(AccelState *as, MachineState *ms)
>  {
> @@ -413,6 +431,8 @@ static int mshv_init(AccelState *as, MachineState *ms)
>          return -1;
>      }
>  
> +    mshv_init_cpu_logic();
> +
>      mshv_init_msicontrol();
>  
>      ret = create_vm(mshv_fd, &vm_fd);
> @@ -442,40 +462,183 @@ static int mshv_init(AccelState *as, MachineState *ms)
>      return 0;
>  }
>  
> +static int mshv_destroy_vcpu(CPUState *cpu)
> +{
> +    int cpu_fd = mshv_vcpufd(cpu);
> +    int vm_fd = mshv_state->vm;
> +
> +    mshv_remove_vcpu(vm_fd, cpu_fd);
> +    mshv_vcpufd(cpu) = 0;
> +
> +    mshv_arch_destroy_vcpu(cpu);
> +    g_free(cpu->accel);

Since the lifetime of the CPUState is not tightly
tied to the cpu->accel, I'd suggest that the code
here should use

  g_clear_pointer(&cpu->accel, g_free);

so that if there is any race with code accessing
cpu->accel after it is free'd, we'll get a clear
NULL de-reference, rather than use-after-free which
is harder to diagnose.


> +static void *mshv_vcpu_thread(void *arg)
> +{
> +    CPUState *cpu = arg;
> +    int ret;
> +
> +    rcu_register_thread();
> +
> +    bql_lock();
> +    qemu_thread_get_self(cpu->thread);
> +    cpu->thread_id = qemu_get_thread_id();

So every MSHV  vCPU has a corresponding Linux thread, similar
to the model with KVM.  In libvirt we rely on the vCPU thread
being controllable with all the normal Linux process related
APIs. For example, setting thread CPU affinity, setting NUMA
memory policy, setting scheduler priorities, putting threads
into cgroups and applying a wide variety of cgroup controls.

Will there be any significant "gotchas" with the threads for
MSHV vCPUs, that would mean the above libvirt controls would
either raise errors, or silently not have any effect ?


> +    current_cpu = cpu;
> +    ret = mshv_init_vcpu(cpu);
> +    if (ret < 0) {
> +        error_report("Failed to init vcpu %d", cpu->cpu_index);
> +        goto cleanup;
> +    }
> +
> +    /* signal CPU creation */
> +    cpu_thread_signal_created(cpu);
> +    qemu_guest_random_seed_thread_part2(cpu->random_seed);
> +
> +    do {
> +        if (cpu_can_run(cpu)) {
> +            mshv_cpu_exec(cpu);
> +        }
> +        qemu_wait_io_event(cpu);
> +    } while (!cpu->unplug || cpu_can_run(cpu));
> +
> +    mshv_destroy_vcpu(cpu);
> +cleanup:
> +    cpu_thread_signal_destroyed(cpu);
> +    bql_unlock();
> +    rcu_unregister_thread();
> +    return NULL;
> +}
> +
>  static void mshv_start_vcpu_thread(CPUState *cpu)
>  {
> -    error_report("unimplemented");
> -    abort();
> +    char thread_name[VCPU_THREAD_NAME_SIZE];
> +
> +    cpu->thread = g_malloc0(sizeof(QemuThread));

     = g_new0(QemuThread, 1);

> +    cpu->halt_cond = g_malloc0(sizeof(QemuCond));

     = g_new0(QemuCond, 1);

> +
> +    qemu_cond_init(cpu->halt_cond);
> +
> +    trace_mshv_start_vcpu_thread(thread_name, cpu->cpu_index);
> +    qemu_thread_create(cpu->thread, thread_name, mshv_vcpu_thread, cpu,
> +                       QEMU_THREAD_JOINABLE);
> +}



With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [PATCH v3 10/26] accel/mshv: Add vCPU creation and execution loop

Posted by Magnus Kulke 4 months, 3 weeks ago

On Wed, Aug 27, 2025 at 12:24:39PM +0100, Daniel P. Berrangé wrote:
> So every MSHV  vCPU has a corresponding Linux thread, similar
> to the model with KVM.  In libvirt we rely on the vCPU thread
> being controllable with all the normal Linux process related
> APIs. For example, setting thread CPU affinity, setting NUMA
> memory policy, setting scheduler priorities, putting threads
> into cgroups and applying a wide variety of cgroup controls.
> 
> Will there be any significant "gotchas" with the threads for
> MSHV vCPUs, that would mean the above libvirt controls would
> either raise errors, or silently not have any effect ?
> 

Hi Daniel,

I am not aware of any such gotchas. The MSHV vCPU threads should
be regular threads that spend most of their time blocked in
ioctl(MSHV_RUN) calls, and as such they should be controllable by
the facilities you mentioned. I know that the folks who tested this
code have been using numactl for reliable performance assessments
without running into issues.

best,

magnus

Re: [PATCH v3 10/26] accel/mshv: Add vCPU creation and execution loop

Posted by Wei Liu 5 months, 2 weeks ago

On Wed, Aug 27, 2025 at 12:24:39PM +0100, Daniel P. Berrangé wrote:
[...]
> 
> > +static void *mshv_vcpu_thread(void *arg)
> > +{
> > +    CPUState *cpu = arg;
> > +    int ret;
> > +
> > +    rcu_register_thread();
> > +
> > +    bql_lock();
> > +    qemu_thread_get_self(cpu->thread);
> > +    cpu->thread_id = qemu_get_thread_id();
> 
> So every MSHV  vCPU has a corresponding Linux thread, similar
> to the model with KVM.  In libvirt we rely on the vCPU thread
> being controllable with all the normal Linux process related
> APIs. For example, setting thread CPU affinity, setting NUMA
> memory policy, setting scheduler priorities, putting threads
> into cgroups and applying a wide variety of cgroup controls.
> 
> Will there be any significant "gotchas" with the threads for
> MSHV vCPUs, that would mean the above libvirt controls would
> either raise errors, or silently not have any effect ?
> 

It depends on the scheduling model of the host.

MSHV supports two scheduling models: hypervisor-based and root
partition. Root partition is the term we use to describe the host VM --
think of it like the Dom0 VM in Xen.

In the hypervisor-based scheduling model, the VCPUs are scheduled by the
hypervisor. The root partition merely tells the hypervisor "this VCPU is
ready to run", and the hypervisor decides when and where to actually run
it. In this model, the VCPU threads, when scheduled, are shown as
blocked. Libvirt controls over the threads won't fail but have no
effect.

The root partition scheduling model is where the root (Linux) can decide
where and when to run the VCPUs.  Everything you mentioned should work
as expected.

For the upcoming project, we are going to use the root scheduling model.

Thanks,
Wei

P.S. In the hypervisor-based scheduling model, the hypervisor does allow
us to set CPU affinity for VCPUs or group them (similar to cgroup but
not the same) by making some hypercalls. We've been thinking about
mapping those into libvirt controls, but haven't made good progress on
that front. It deserves its own discussion.