[v1] Implementing a MSHV (Microsoft Hypervisor) accelerator

[RFC PATCH 12/25] accel/mshv: Add vCPU creation and execution loop

Posted by Magnus Kulke 5 months, 4 weeks ago

Create MSHV vCPUs using MSHV_CREATE_VP and initialize their state.
Register the MSHV CPU execution loop with the QEMU accelerator
framework to enable guest code execution.

The target/i386 functionality is still mostly stubbed out and will be
populated in a later commit in this series.

Signed-off-by: Magnus Kulke <magnuskulke@linux.microsoft.com>
---
 accel/mshv/mshv-all.c       | 197 +++++++++++++++++++++++++++++++++---
 accel/mshv/trace-events     |   1 +
 include/system/mshv.h       |  19 ++++
 target/i386/mshv/mshv-cpu.c |  63 ++++++++++++
 4 files changed, 268 insertions(+), 12 deletions(-)

diff --git a/accel/mshv/mshv-all.c b/accel/mshv/mshv-all.c
index a29e356ba0..71fedc9137 100644
--- a/accel/mshv/mshv-all.c
+++ b/accel/mshv/mshv-all.c
@@ -400,6 +400,24 @@ int mshv_hvcall(int mshv_fd, const struct mshv_root_hvcall *args)
     return ret;
 }
 
+static int mshv_init_vcpu(CPUState *cpu)
+{
+    int vm_fd = mshv_state->vm;
+    uint8_t vp_index = cpu->cpu_index;
+    int ret;
+
+    mshv_arch_init_vcpu(cpu);
+    cpu->accel = g_new0(AccelCPUState, 1);
+
+    ret = mshv_create_vcpu(vm_fd, vp_index, &cpu->accel->cpufd);
+    if (ret < 0) {
+        return -1;
+    }
+
+    cpu->accel->dirty = true;
+
+    return 0;
+}
 
 static int mshv_init(MachineState *ms)
 {
@@ -417,6 +435,8 @@ static int mshv_init(MachineState *ms)
         return -1;
     }
 
+    mshv_init_cpu_logic();
+
     mshv_init_msicontrol();
 
     do {
@@ -440,40 +460,193 @@ static int mshv_init(MachineState *ms)
     return 0;
 }
 
+static int mshv_destroy_vcpu(CPUState *cpu)
+{
+    int cpu_fd = mshv_vcpufd(cpu);
+    int vm_fd = mshv_state->vm;
+
+    mshv_remove_vcpu(vm_fd, cpu_fd);
+    mshv_vcpufd(cpu) = 0;
+
+    mshv_arch_destroy_vcpu(cpu);
+    g_free(cpu->accel);
+    return 0;
+}
+
+static int mshv_cpu_exec(CPUState *cpu)
+{
+    hv_message mshv_msg;
+    enum MshvVmExit exit_reason;
+    int ret = 0;
+
+    bql_unlock();
+    cpu_exec_start(cpu);
+
+    do {
+        if (cpu->accel->dirty) {
+            ret = mshv_arch_put_registers(cpu);
+            if (ret) {
+                error_report("Failed to put registers after init: %s",
+                              strerror(-ret));
+                ret = -1;
+                break;
+            }
+            cpu->accel->dirty = false;
+        }
+
+        if (qatomic_read(&cpu->exit_request)) {
+            qemu_cpu_kick_self();
+        }
+
+        /*
+         * Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
+         * Matching barrier in kvm_eat_signals.
+         */
+        smp_rmb();
+
+        ret = mshv_run_vcpu(mshv_state->vm, cpu, &mshv_msg, &exit_reason);
+        if (ret < 0) {
+            error_report("Failed to run on vcpu %d", cpu->cpu_index);
+            abort();
+        }
+
+        switch (exit_reason) {
+        case MshvVmExitIgnore:
+            break;
+        default:
+            ret = EXCP_INTERRUPT;
+            break;
+        }
+    } while (ret == 0);
+
+    cpu_exec_end(cpu);
+    bql_lock();
+
+    if (ret < 0) {
+        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
+        vm_stop(RUN_STATE_INTERNAL_ERROR);
+    }
+
+    qatomic_set(&cpu->exit_request, 0);
+    return ret;
+}
+
+static void *mshv_vcpu_thread(void *arg)
+{
+    CPUState *cpu = arg;
+    int ret;
+
+    rcu_register_thread();
+
+    bql_lock();
+    qemu_thread_get_self(cpu->thread);
+    cpu->thread_id = qemu_get_thread_id();
+    current_cpu = cpu;
+    ret = mshv_init_vcpu(cpu);
+    if (ret < 0) {
+        error_report("Failed to init vcpu %d", cpu->cpu_index);
+        goto cleanup;
+    }
+
+    /* signal CPU creation */
+    cpu_thread_signal_created(cpu);
+    qemu_guest_random_seed_thread_part2(cpu->random_seed);
+
+    do {
+        if (cpu_can_run(cpu)) {
+            mshv_cpu_exec(cpu);
+        }
+        qemu_wait_io_event(cpu);
+    } while (!cpu->unplug || cpu_can_run(cpu));
+
+    mshv_destroy_vcpu(cpu);
+cleanup:
+    cpu_thread_signal_destroyed(cpu);
+    bql_unlock();
+    rcu_unregister_thread();
+    return NULL;
+}
+
 static void mshv_start_vcpu_thread(CPUState *cpu)
 {
-	error_report("unimplemented");
-	abort();
+    char thread_name[VCPU_THREAD_NAME_SIZE];
+
+    cpu->thread = g_malloc0(sizeof(QemuThread));
+    cpu->halt_cond = g_malloc0(sizeof(QemuCond));
+
+    qemu_cond_init(cpu->halt_cond);
+
+    trace_mshv_start_vcpu_thread(thread_name, cpu->cpu_index);
+    qemu_thread_create(cpu->thread, thread_name, mshv_vcpu_thread, cpu,
+                       QEMU_THREAD_JOINABLE);
+}
+
+static void do_mshv_cpu_synchronize_post_init(CPUState *cpu,
+                                              run_on_cpu_data arg)
+{
+    int ret = mshv_arch_put_registers(cpu);
+    if (ret < 0) {
+        error_report("Failed to put registers after init: %s", strerror(-ret));
+        abort();
+    }
+
+    cpu->accel->dirty = false;
 }
 
 static void mshv_cpu_synchronize_post_init(CPUState *cpu)
 {
-	error_report("unimplemented");
-	abort();
+    run_on_cpu(cpu, do_mshv_cpu_synchronize_post_init, RUN_ON_CPU_NULL);
 }
 
 static void mshv_cpu_synchronize_post_reset(CPUState *cpu)
 {
-	error_report("unimplemented");
-	abort();
+    int ret = mshv_arch_put_registers(cpu);
+    if (ret) {
+        error_report("Failed to put registers after reset: %s",
+                     strerror(-ret));
+        cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
+        vm_stop(RUN_STATE_INTERNAL_ERROR);
+    }
+    cpu->accel->dirty = false;
+}
+
+static void do_mshv_cpu_synchronize_pre_loadvm(CPUState *cpu,
+                                               run_on_cpu_data arg)
+{
+    cpu->accel->dirty = true;
 }
 
 static void mshv_cpu_synchronize_pre_loadvm(CPUState *cpu)
 {
-	error_report("unimplemented");
-	abort();
+    run_on_cpu(cpu, do_mshv_cpu_synchronize_pre_loadvm, RUN_ON_CPU_NULL);
+}
+
+static void do_mshv_cpu_synchronize(CPUState *cpu, run_on_cpu_data arg)
+{
+    if (!cpu->accel->dirty) {
+        int ret = mshv_load_regs(cpu);
+        if (ret < 0) {
+            error_report("Failed to load registers for vcpu %d",
+                         cpu->cpu_index);
+
+            cpu_dump_state(cpu, stderr, CPU_DUMP_CODE);
+            vm_stop(RUN_STATE_INTERNAL_ERROR);
+        }
+
+        cpu->accel->dirty = true;
+    }
 }
 
 static void mshv_cpu_synchronize(CPUState *cpu)
 {
-	error_report("unimplemented");
-	abort();
+    if (!cpu->accel->dirty) {
+        run_on_cpu(cpu, do_mshv_cpu_synchronize, RUN_ON_CPU_NULL);
+    }
 }
 
 static bool mshv_cpus_are_resettable(void)
 {
-	error_report("unimplemented");
-	abort();
+    return false;
 }
 
 static void mshv_accel_class_init(ObjectClass *oc, const void *data)
diff --git a/accel/mshv/trace-events b/accel/mshv/trace-events
index beb5be7b73..06aa27ef67 100644
--- a/accel/mshv/trace-events
+++ b/accel/mshv/trace-events
@@ -1,5 +1,6 @@
 # See docs/devel/tracing.rst for syntax documentation.
 
+mshv_start_vcpu_thread(const char* thread, uint32_t cpu) "thread %s cpu_index %d"
 mshv_handle_interrupt(uint32_t cpu, int mask) "cpu_index %d mask %x"
 mshv_set_memory(bool add, uint64_t gpa, uint64_t size, uint64_t user_addr, bool readonly, int ret) "[add = %d] gpa = %lx size = %lx user = %lx readonly = %d result = %d"
 mshv_mem_ioeventfd_add(uint64_t addr, uint32_t size, uint32_t data) "addr %lx size %d data %x"
diff --git a/include/system/mshv.h b/include/system/mshv.h
index 4c1e901835..458b182077 100644
--- a/include/system/mshv.h
+++ b/include/system/mshv.h
@@ -32,6 +32,8 @@
 #define CONFIG_MSHV_IS_POSSIBLE
 #endif
 
+typedef struct hyperv_message hv_message;
+
 /*
  * Set to 0 if we do not want to use eventfd to optimize the MMIO events.
  * Set to 1 so that mshv kernel driver receives doorbell when the VM access
@@ -81,6 +83,8 @@ typedef struct MshvMsiControl {
     GHashTable *gsi_routes;
 } MshvMsiControl;
 
+#define mshv_vcpufd(cpu) (cpu->accel->cpufd)
+
 #else /* CONFIG_MSHV_IS_POSSIBLE */
 #define mshv_enabled() false
 #endif
@@ -95,6 +99,21 @@ typedef struct MshvMsiControl {
 #define EFER_LMA   ((uint64_t)0x400)
 #define EFER_LME   ((uint64_t)0x100)
 
+typedef enum MshvVmExit {
+    MshvVmExitIgnore   = 0,
+    MshvVmExitShutdown = 1,
+    MshvVmExitSpecial  = 2,
+} MshvVmExit;
+
+void mshv_init_cpu_logic(void);
+int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd);
+void mshv_remove_vcpu(int vm_fd, int cpu_fd);
+int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit);
+int mshv_load_regs(CPUState *cpu);
+int mshv_store_regs(CPUState *cpu);
+int mshv_arch_put_registers(const CPUState *cpu);
+void mshv_arch_init_vcpu(CPUState *cpu);
+void mshv_arch_destroy_vcpu(CPUState *cpu);
 void mshv_arch_amend_proc_features(
     union hv_partition_synthetic_processor_features *features);
 int mshv_arch_post_init_vm(int vm_fd);
diff --git a/target/i386/mshv/mshv-cpu.c b/target/i386/mshv/mshv-cpu.c
index b36f8904fb..c4b2c297e2 100644
--- a/target/i386/mshv/mshv-cpu.c
+++ b/target/i386/mshv/mshv-cpu.c
@@ -22,16 +22,79 @@
 #include "hw/hyperv/hvhdk_mini.h"
 #include "hw/hyperv/hvgdk.h"
 
+#include "cpu.h"
+#include "emulate/x86_decode.h"
+#include "emulate/x86_emu.h"
+#include "emulate/x86_flags.h"
 
 #include "trace-accel_mshv.h"
 #include "trace.h"
 
+int mshv_store_regs(CPUState *cpu)
+{
+	error_report("unimplemented");
+	abort();
+}
+
+int mshv_load_regs(CPUState *cpu)
+{
+	error_report("unimplemented");
+	abort();
+}
+
+int mshv_arch_put_registers(const CPUState *cpu)
+{
+	error_report("unimplemented");
+	abort();
+}
+
 void mshv_arch_amend_proc_features(
     union hv_partition_synthetic_processor_features *features)
 {
     features->access_guest_idle_reg = 1;
 }
 
+int mshv_run_vcpu(int vm_fd, CPUState *cpu, hv_message *msg, MshvVmExit *exit)
+{
+	error_report("unimplemented");
+	abort();
+}
+
+void mshv_remove_vcpu(int vm_fd, int cpu_fd)
+{
+	error_report("unimplemented");
+	abort();
+}
+
+int mshv_create_vcpu(int vm_fd, uint8_t vp_index, int *cpu_fd)
+{
+	error_report("unimplemented");
+	abort();
+}
+
+void mshv_init_cpu_logic(void)
+{
+	error_report("unimplemented");
+	abort();
+}
+
+void mshv_arch_init_vcpu(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+
+    env->emu_mmio_buf = g_new(char, 4096);
+}
+
+void mshv_arch_destroy_vcpu(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+
+    g_free(env->emu_mmio_buf);
+    env->emu_mmio_buf = NULL;
+}
+
 /*
  * Default Microsoft Hypervisor behavior for unimplemented MSR is to send a
  * fault to the guest if it tries to access it. It is possible to override
-- 
2.34.1

Re: [RFC PATCH 12/25] accel/mshv: Add vCPU creation and execution loop

Posted by Paolo Bonzini 5 months, 4 weeks ago

On 5/20/25 13:30, Magnus Kulke wrote:
> +    int ret;
> +    hv_message exit_msg = { 0 };

You probably don't want to fill 512 bytes on every vmentry.  Maybe pass 
&exit_msg up from mshv_cpu_exec()?

> +        /*
> +         * Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
> +         * Matching barrier in kvm_eat_signals.
> +         */
> +        smp_rmb();

The comment is obviously wrong; unfortunately, the code is wrong too:

1) qemu_cpu_kick_self() is only needed for an old KVM API.  In that API 
the signal handler is blocked while QEMU runs.  In your case, 
qemu_cpu_kick_self() is an expensive way to do nothing.

2) Because of this, there's a race condition between delivering the 
signal and entering MSHV_RUN_VP

You need support in the hypervisor for this: KVM and HVF both have it.

There are two ways to do it, for both cases the hypervisor side for the 
latter can be something like this:

diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
index 72df774e410a..627afece4046 100644
--- a/drivers/hv/mshv_root_main.c
+++ b/drivers/hv/mshv_root_main.c
@@ -530,7 +530,7 @@ static long mshv_run_vp_with_root_scheduler(
  		struct hv_output_dispatch_vp output;

  		ret = mshv_pre_guest_mode_work(vp);
-		if (ret)
+		if (ret || vp->run.flags.immediate_exit)
  			break;

  		if (vp->run.flags.intercept_suspend)
@@ -585,6 +585,7 @@
  		}
  	} while (!vp->run.flags.intercept_suspend);

+	vp->run.flags.immediate_exit = 0;
  	return ret;
  }

Instead of calling qemu_cpu_kick_self(), your signal handler would 
invoke a new MSHV ioctl that sets vp->run.flags.immediate_exit = 1.

And then you also don't need the barrier, by the way, because all 
inter-thread communication is mediated by the signal handler.

Paolo

Re: [RFC PATCH 12/25] accel/mshv: Add vCPU creation and execution loop

Posted by Nuno Das Neves 5 months, 1 week ago

On 5/20/2025 6:50 AM, Paolo Bonzini wrote:
> On 5/20/25 13:30, Magnus Kulke wrote:
>> +    int ret;
>> +    hv_message exit_msg = { 0 };
> 
> You probably don't want to fill 512 bytes on every vmentry.  Maybe pass &exit_msg up from mshv_cpu_exec()?
> 
>> +        /*
>> +         * Read cpu->exit_request before KVM_RUN reads run->immediate_exit.
>> +         * Matching barrier in kvm_eat_signals.
>> +         */
>> +        smp_rmb();
> 
> The comment is obviously wrong; unfortunately, the code is wrong too:
> 
> 1) qemu_cpu_kick_self() is only needed for an old KVM API.  In that API the signal handler is blocked while QEMU runs.  In your case, qemu_cpu_kick_self() is an expensive way to do nothing.
> 
> 2) Because of this, there's a race condition between delivering the signal and entering MSHV_RUN_VP
> 

Hi Paolo,

I might be misunderstanding something here, but isn't there a race condition regardless of where this check is made?
i.e., checking a flag in userspace, like the above:

if (qatomic_read(&cpu->exit_request)) {

vs checking the flag in the kernel, are effectively doing the same thing.
The signal can still come just after the check is made (in the kernel) and the VP will dispatch.

The virtual "explicit suspend" register in the VP seems to solve this problem - it can be used for manually kicking the VP
while it is running. But, it can also be set before dispatching the VP, and the dispatch hypercall will return immediately
in that case.

Thanks
Nuno

> You need support in the hypervisor for this: KVM and HVF both have it.
> 
> There are two ways to do it, for both cases the hypervisor side for the latter can be something like this:
> 
> diff --git a/drivers/hv/mshv_root_main.c b/drivers/hv/mshv_root_main.c
> index 72df774e410a..627afece4046 100644
> --- a/drivers/hv/mshv_root_main.c
> +++ b/drivers/hv/mshv_root_main.c
> @@ -530,7 +530,7 @@ static long mshv_run_vp_with_root_scheduler(
>          struct hv_output_dispatch_vp output;
> 
>          ret = mshv_pre_guest_mode_work(vp);
> -        if (ret)
> +        if (ret || vp->run.flags.immediate_exit)
>              break;
> 
>          if (vp->run.flags.intercept_suspend)
> @@ -585,6 +585,7 @@
>          }
>      } while (!vp->run.flags.intercept_suspend);
> 
> +    vp->run.flags.immediate_exit = 0;
>      return ret;
>  }
> 
> 
> Instead of calling qemu_cpu_kick_self(), your signal handler would invoke a new MSHV ioctl that sets vp->run.flags.immediate_exit = 1.
> 
> And then you also don't need the barrier, by the way, because all inter-thread communication is mediated by the signal handler.
> 
> Paolo
> 
> 
>

Re: [RFC PATCH 12/25] accel/mshv: Add vCPU creation and execution loop

Posted by Paolo Bonzini 5 months, 4 weeks ago

On 5/20/25 15:50, Paolo Bonzini wrote:
> You need support in the hypervisor for this: KVM and HVF both have it.
> 
> There are two ways to do it

Sorry - I left out the other way which is to pass something *into* 
MSHV_RUN_VP since only half of it is currently used (I think).  But 
that's more complicated; the advantage would be to avoid the ioctl in 
the signal handler but it's not a fast path.  I would just do it the 
easy way.

Paolo

Re: [RFC PATCH 12/25] accel/mshv: Add vCPU creation and execution loop

Posted by Wei Liu 5 months, 3 weeks ago

On Tue, May 20, 2025 at 03:54:57PM +0200, Paolo Bonzini wrote:
> On 5/20/25 15:50, Paolo Bonzini wrote:
> > You need support in the hypervisor for this: KVM and HVF both have it.
> > 
> > There are two ways to do it
> 
> Sorry - I left out the other way which is to pass something *into*
> MSHV_RUN_VP since only half of it is currently used (I think).  But that's
> more complicated; the advantage would be to avoid the ioctl in the signal
> handler but it's not a fast path.  I would just do it the easy way.

Thank you for the suggestions. We need some time to discuss kernel side
changes.

Thanks,
Wei.

> 
> Paolo
>