[RFC PATCH 03/25] target/i386/mshv: Add x86 decoder/emu implementation

Magnus Kulke posted 25 patches 5 months, 4 weeks ago
There is a newer version of this series
[RFC PATCH 03/25] target/i386/mshv: Add x86 decoder/emu implementation
Posted by Magnus Kulke 5 months, 4 weeks ago
The MSHV accelerator requires a x86 decoder/emulator in userland to
emulate MMIO instructions. This change contains the implementations for
the generalized i386 instruction decoder/emulator.

Signed-off-by: Magnus Kulke <magnuskulke@linux.microsoft.com>
---
 include/system/mshv.h           |  32 ++++
 target/i386/cpu.h               |   2 +-
 target/i386/emulate/meson.build |   7 +-
 target/i386/meson.build         |   2 +
 target/i386/mshv/meson.build    |   7 +
 target/i386/mshv/x86.c          | 330 ++++++++++++++++++++++++++++++++
 6 files changed, 377 insertions(+), 3 deletions(-)
 create mode 100644 include/system/mshv.h
 create mode 100644 target/i386/mshv/meson.build
 create mode 100644 target/i386/mshv/x86.c

diff --git a/include/system/mshv.h b/include/system/mshv.h
new file mode 100644
index 0000000000..8380b92da2
--- /dev/null
+++ b/include/system/mshv.h
@@ -0,0 +1,32 @@
+/*
+ * QEMU MSHV support
+ *
+ * Copyright Microsoft, Corp. 2025
+ *
+ * Authors:
+ *  Ziqiao Zhou       <ziqiaozhou@microsoft.com>
+ *  Magnus Kulke      <magnuskulke@microsoft.com>
+ *  Jinank Jain       <jinankjain@microsoft.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef QEMU_MSHV_INT_H
+#define QEMU_MSHV_INT_H
+
+#ifdef COMPILING_PER_TARGET
+#ifdef CONFIG_MSHV
+#define CONFIG_MSHV_IS_POSSIBLE
+#endif
+#else
+#define CONFIG_MSHV_IS_POSSIBLE
+#endif
+
+/* cpu */
+/* EFER (technically not a register) bits */
+#define EFER_LMA   ((uint64_t)0x400)
+#define EFER_LME   ((uint64_t)0x100)
+
+#endif
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 4f8ed8868e..db6a37b271 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2101,7 +2101,7 @@ typedef struct CPUArchState {
     QEMUTimer *xen_periodic_timer;
     QemuMutex xen_timers_lock;
 #endif
-#if defined(CONFIG_HVF)
+#if defined(CONFIG_HVF) || defined(CONFIG_MSHV)
     X86LazyFlags lflags;
     void *emu_mmio_buf;
 #endif
diff --git a/target/i386/emulate/meson.build b/target/i386/emulate/meson.build
index 4edd4f462f..b6dafb6a5b 100644
--- a/target/i386/emulate/meson.build
+++ b/target/i386/emulate/meson.build
@@ -1,5 +1,8 @@
-i386_system_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: files(
+emulator_files = files(
   'x86_decode.c',
   'x86_emu.c',
   'x86_flags.c',
-))
+)
+
+i386_system_ss.add(when: [hvf, 'CONFIG_HVF'], if_true: emulator_files)
+i386_system_ss.add(when: 'CONFIG_MSHV', if_true: emulator_files)
diff --git a/target/i386/meson.build b/target/i386/meson.build
index c1aacea613..6097e5c427 100644
--- a/target/i386/meson.build
+++ b/target/i386/meson.build
@@ -11,6 +11,7 @@ i386_ss.add(when: 'CONFIG_SEV', if_true: files('host-cpu.c', 'confidential-guest
 # x86 cpu type
 i386_ss.add(when: 'CONFIG_KVM', if_true: files('host-cpu.c'))
 i386_ss.add(when: 'CONFIG_HVF', if_true: files('host-cpu.c'))
+i386_ss.add(when: 'CONFIG_MSHV', if_true: files('host-cpu.c'))
 
 i386_system_ss = ss.source_set()
 i386_system_ss.add(files(
@@ -32,6 +33,7 @@ subdir('nvmm')
 subdir('hvf')
 subdir('tcg')
 subdir('emulate')
+subdir('mshv')
 
 target_arch += {'i386': i386_ss}
 target_system_arch += {'i386': i386_system_ss}
diff --git a/target/i386/mshv/meson.build b/target/i386/mshv/meson.build
new file mode 100644
index 0000000000..8ddaa7c11d
--- /dev/null
+++ b/target/i386/mshv/meson.build
@@ -0,0 +1,7 @@
+i386_mshv_ss = ss.source_set()
+
+i386_mshv_ss.add(files(
+  'x86.c',
+))
+
+i386_system_ss.add_all(when: 'CONFIG_MSHV', if_true: i386_mshv_ss)
diff --git a/target/i386/mshv/x86.c b/target/i386/mshv/x86.c
new file mode 100644
index 0000000000..581710fd06
--- /dev/null
+++ b/target/i386/mshv/x86.c
@@ -0,0 +1,330 @@
+/*
+ * QEMU MSHV support
+ *
+ * Copyright Microsoft, Corp. 2025
+ *
+ * Authors:
+ *  Magnus Kulke      <magnuskulke@microsoft.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu/osdep.h"
+
+#include "cpu.h"
+#include "emulate/x86_decode.h"
+#include "emulate/x86_emu.h"
+#include "qemu/typedefs.h"
+#include "qemu/error-report.h"
+#include "system/mshv.h"
+
+/* RW or Exec segment */
+static const uint8_t RWRX_SEGMENT_TYPE        = 0x2;
+static const uint8_t CODE_SEGMENT_TYPE        = 0x8;
+static const uint8_t EXPAND_DOWN_SEGMENT_TYPE = 0x4;
+
+typedef enum CpuMode {
+    REAL_MODE,
+    PROTECTED_MODE,
+    LONG_MODE,
+} CpuMode;
+
+static CpuMode cpu_mode(CPUState *cpu)
+{
+    enum CpuMode m = REAL_MODE;
+
+    if (x86_is_protected(cpu)) {
+        m = PROTECTED_MODE;
+
+        if (x86_is_long_mode(cpu)) {
+            m = LONG_MODE;
+        }
+    }
+
+    return m;
+}
+
+static bool segment_type_ro(const SegmentCache *seg)
+{
+    uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15;
+    return (type_ & (~RWRX_SEGMENT_TYPE)) == 0;
+}
+
+static bool segment_type_code(const SegmentCache *seg)
+{
+    uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15;
+    return (type_ & CODE_SEGMENT_TYPE) != 0;
+}
+
+static bool segment_expands_down(const SegmentCache *seg)
+{
+    uint32_t type_ = (seg->flags >> DESC_TYPE_SHIFT) & 15;
+
+    if (segment_type_code(seg)) {
+        return false;
+    }
+
+    return (type_ & EXPAND_DOWN_SEGMENT_TYPE) != 0;
+}
+
+static uint32_t segment_limit(const SegmentCache *seg)
+{
+    uint32_t limit = seg->limit;
+    uint32_t granularity = (seg->flags & DESC_G_MASK) != 0;
+
+    if (granularity != 0) {
+        limit = (limit << 12) | 0xFFF;
+    }
+
+    return limit;
+}
+
+static uint8_t segment_db(const SegmentCache *seg)
+{
+    return (seg->flags >> DESC_B_SHIFT) & 1;
+}
+
+static uint32_t segment_max_limit(const SegmentCache *seg)
+{
+    if (segment_db(seg) != 0) {
+        return 0xFFFFFFFF;
+    }
+    return 0xFFFF;
+}
+
+static int linearize(CPUState *cpu,
+                     target_ulong logical_addr, target_ulong *linear_addr,
+                     X86Seg seg_idx)
+{
+    enum CpuMode mode;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    SegmentCache *seg = &env->segs[seg_idx];
+    target_ulong base = seg->base;
+    target_ulong logical_addr_32b;
+    uint32_t limit;
+    /* TODO: the emulator will not pass us "write" indicator yet */
+    bool write = false;
+
+    mode = cpu_mode(cpu);
+
+    switch (mode) {
+    case LONG_MODE:
+        if (__builtin_add_overflow(logical_addr, base, linear_addr)) {
+            error_report("Address overflow");
+            return -1;
+        }
+        break;
+    case PROTECTED_MODE:
+    case REAL_MODE:
+        if (segment_type_ro(seg) && write) {
+            error_report("Cannot write to read-only segment");
+            return -1;
+        }
+
+        logical_addr_32b = logical_addr & 0xFFFFFFFF;
+        limit = segment_limit(seg);
+
+        if (segment_expands_down(seg)) {
+            if (logical_addr_32b >= limit) {
+                error_report("Address exceeds limit (expands down)");
+                return -1;
+            }
+
+            limit = segment_max_limit(seg);
+        }
+
+        if (logical_addr_32b > limit) {
+            error_report("Address exceeds limit %u", limit);
+            return -1;
+        }
+        *linear_addr = logical_addr_32b + base;
+        break;
+    default:
+        error_report("Unknown cpu mode: %d", mode);
+        return -1;
+    }
+
+    return 0;
+}
+
+bool x86_read_segment_descriptor(CPUState *cpu,
+                                 struct x86_segment_descriptor *desc,
+                                 x86_segment_selector sel)
+{
+    target_ulong base;
+    uint32_t limit;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    target_ulong gva;
+    /* int ret; */
+
+    memset(desc, 0, sizeof(*desc));
+
+    /* valid gdt descriptors start from index 1 */
+    if (!sel.index && GDT_SEL == sel.ti) {
+        return false;
+    }
+
+    if (GDT_SEL == sel.ti) {
+        base = env->gdt.base;
+        limit = env->gdt.limit;
+    } else {
+        base = env->ldt.base;
+        limit = env->ldt.limit;
+    }
+
+    if (sel.index * 8 >= limit) {
+        return false;
+    }
+
+    gva = base + sel.index * 8;
+    emul_ops->read_mem(cpu, desc, gva, sizeof(*desc));
+
+    return true;
+}
+
+bool x86_write_segment_descriptor(CPUState *cpu,
+                                  struct x86_segment_descriptor *desc,
+                                  x86_segment_selector sel)
+{
+    target_ulong base;
+    uint32_t limit;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    /* int ret; */
+    target_ulong gva;
+
+    if (GDT_SEL == sel.ti) {
+        base = env->gdt.base;
+        limit = env->gdt.limit;
+    } else {
+        base = env->ldt.base;
+        limit = env->ldt.limit;
+    }
+
+    if (sel.index * 8 >= limit) {
+        return false;
+    }
+
+    gva = base + sel.index * 8;
+    emul_ops->write_mem(cpu, desc, gva, sizeof(*desc));
+
+    return true;
+}
+
+bool x86_read_call_gate(CPUState *cpu, struct x86_call_gate *idt_desc,
+                        int gate)
+{
+    target_ulong base;
+    uint32_t limit;
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    target_ulong gva;
+
+    base = env->idt.base;
+    limit = env->idt.limit;
+
+    memset(idt_desc, 0, sizeof(*idt_desc));
+    if (gate * 8 >= limit) {
+        perror("call gate exceeds idt limit");
+        return false;
+    }
+
+    gva = base + gate * 8;
+    emul_ops->read_mem(cpu, idt_desc, gva, sizeof(*idt_desc));
+
+    return true;
+}
+
+bool x86_is_protected(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    uint64_t cr0 = env->cr[0];
+
+    return cr0 & CR0_PE_MASK;
+}
+
+bool x86_is_real(CPUState *cpu)
+{
+    return !x86_is_protected(cpu);
+}
+
+bool x86_is_v8086(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    return x86_is_protected(cpu) && (env->eflags & VM_MASK);
+}
+
+bool x86_is_long_mode(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    uint64_t efer = env->efer;
+
+    return ((efer & (EFER_LME | EFER_LMA)) == (EFER_LME | EFER_LMA));
+}
+
+bool x86_is_long64_mode(CPUState *cpu)
+{
+    error_report("unimplemented: is_long64_mode()");
+    abort();
+}
+
+bool x86_is_paging_mode(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    uint64_t cr0 = env->cr[0];
+
+    return cr0 & CR0_PG_MASK;
+}
+
+bool x86_is_pae_enabled(CPUState *cpu)
+{
+    X86CPU *x86_cpu = X86_CPU(cpu);
+    CPUX86State *env = &x86_cpu->env;
+    uint64_t cr4 = env->cr[4];
+
+    return cr4 & CR4_PAE_MASK;
+}
+
+target_ulong linear_addr(CPUState *cpu, target_ulong addr, X86Seg seg)
+{
+    int ret;
+    target_ulong linear_addr;
+
+    /* return vmx_read_segment_base(cpu, seg) + addr; */
+    ret = linearize(cpu, addr, &linear_addr, seg);
+    if (ret < 0) {
+        error_report("failed to linearize address");
+        abort();
+    }
+
+    return linear_addr;
+}
+
+target_ulong linear_addr_size(CPUState *cpu, target_ulong addr, int size,
+                              X86Seg seg)
+{
+    switch (size) {
+    case 2:
+        addr = (uint16_t)addr;
+        break;
+    case 4:
+        addr = (uint32_t)addr;
+        break;
+    default:
+        break;
+    }
+    return linear_addr(cpu, addr, seg);
+}
+
+target_ulong linear_rip(CPUState *cpu, target_ulong rip)
+{
+    return linear_addr(cpu, rip, R_CS);
+}
-- 
2.34.1
Re: [RFC PATCH 03/25] target/i386/mshv: Add x86 decoder/emu implementation
Posted by Wei Liu 5 months, 4 weeks ago
On Tue, May 20, 2025 at 01:29:56PM +0200, Magnus Kulke wrote:
> The MSHV accelerator requires a x86 decoder/emulator in userland to
> emulate MMIO instructions. This change contains the implementations for
> the generalized i386 instruction decoder/emulator.
> 
> Signed-off-by: Magnus Kulke <magnuskulke@linux.microsoft.com>
> ---
[...]
> +bool x86_read_segment_descriptor(CPUState *cpu,
> +                                 struct x86_segment_descriptor *desc,
> +                                 x86_segment_selector sel)
> +{
> +    target_ulong base;
> +    uint32_t limit;
> +    X86CPU *x86_cpu = X86_CPU(cpu);
> +    CPUX86State *env = &x86_cpu->env;
> +    target_ulong gva;
> +    /* int ret; */

Unused code. Please drop.

> +
> +    memset(desc, 0, sizeof(*desc));
> +
> +    /* valid gdt descriptors start from index 1 */
> +    if (!sel.index && GDT_SEL == sel.ti) {
> +        return false;
> +    }
> +
> +    if (GDT_SEL == sel.ti) {
> +        base = env->gdt.base;
> +        limit = env->gdt.limit;
> +    } else {
> +        base = env->ldt.base;
> +        limit = env->ldt.limit;
> +    }
> +
> +    if (sel.index * 8 >= limit) {
> +        return false;
> +    }
> +
> +    gva = base + sel.index * 8;
> +    emul_ops->read_mem(cpu, desc, gva, sizeof(*desc));
> +
> +    return true;
> +}
> +
> +bool x86_write_segment_descriptor(CPUState *cpu,
> +                                  struct x86_segment_descriptor *desc,
> +                                  x86_segment_selector sel)
> +{
> +    target_ulong base;
> +    uint32_t limit;
> +    X86CPU *x86_cpu = X86_CPU(cpu);
> +    CPUX86State *env = &x86_cpu->env;
> +    /* int ret; */

Unused code. Please drop.

> +    target_ulong gva;
> +
> +    if (GDT_SEL == sel.ti) {
> +        base = env->gdt.base;
> +        limit = env->gdt.limit;
> +    } else {
> +        base = env->ldt.base;
> +        limit = env->ldt.limit;
> +    }
> +
> +    if (sel.index * 8 >= limit) {
> +        return false;
> +    }
> +
> +    gva = base + sel.index * 8;
> +    emul_ops->write_mem(cpu, desc, gva, sizeof(*desc));
> +
> +    return true;
> +}
> +
[...]
> +
> +target_ulong linear_addr(CPUState *cpu, target_ulong addr, X86Seg seg)
> +{
> +    int ret;
> +    target_ulong linear_addr;
> +
> +    /* return vmx_read_segment_base(cpu, seg) + addr; */

Unused code.

Thanks,
Wei.

> +    ret = linearize(cpu, addr, &linear_addr, seg);
> +    if (ret < 0) {
> +        error_report("failed to linearize address");
> +        abort();
> +    }
> +
> +    return linear_addr;
> +}
> +
> +target_ulong linear_addr_size(CPUState *cpu, target_ulong addr, int size,
> +                              X86Seg seg)
> +{
> +    switch (size) {
> +    case 2:
> +        addr = (uint16_t)addr;
> +        break;
> +    case 4:
> +        addr = (uint32_t)addr;
> +        break;
> +    default:
> +        break;
> +    }
> +    return linear_addr(cpu, addr, seg);
> +}
> +
> +target_ulong linear_rip(CPUState *cpu, target_ulong rip)
> +{
> +    return linear_addr(cpu, rip, R_CS);
> +}
> -- 
> 2.34.1
>
Re: [RFC PATCH 03/25] target/i386/mshv: Add x86 decoder/emu implementation
Posted by Paolo Bonzini 5 months, 4 weeks ago
On 5/20/25 13:29, Magnus Kulke wrote:
> +/* cpu */
> +/* EFER (technically not a register) bits */
> +#define EFER_LMA   ((uint64_t)0x400)
> +#define EFER_LME   ((uint64_t)0x100)

There's already MSR_EFER_LMA and MSR_EFER_LME, please use them.
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 4f8ed8868e..db6a37b271 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -2101,7 +2101,7 @@ typedef struct CPUArchState {
>      QEMUTimer *xen_periodic_timer;
>      QemuMutex xen_timers_lock;
>  #endif
> -#if defined(CONFIG_HVF)
> +#if defined(CONFIG_HVF) || defined(CONFIG_MSHV)
>      X86LazyFlags lflags;
>      void *emu_mmio_buf;
>  #endif

Please rebase since the lflags member was removed.

> +bool x86_write_segment_descriptor(CPUState *cpu,
> +                                  struct x86_segment_descriptor *desc,
> +                                  x86_segment_selector sel)

There are a bunch of unused functions such as this one.

Paolo
Re: [RFC PATCH 03/25] target/i386/mshv: Add x86 decoder/emu implementation
Posted by Daniel P. Berrangé 5 months, 4 weeks ago
On Tue, May 20, 2025 at 01:29:56PM +0200, Magnus Kulke wrote:
> The MSHV accelerator requires a x86 decoder/emulator in userland to
> emulate MMIO instructions. This change contains the implementations for
> the generalized i386 instruction decoder/emulator.
> 
> Signed-off-by: Magnus Kulke <magnuskulke@linux.microsoft.com>
> ---
>  include/system/mshv.h           |  32 ++++
>  target/i386/cpu.h               |   2 +-
>  target/i386/emulate/meson.build |   7 +-
>  target/i386/meson.build         |   2 +
>  target/i386/mshv/meson.build    |   7 +
>  target/i386/mshv/x86.c          | 330 ++++++++++++++++++++++++++++++++
>  6 files changed, 377 insertions(+), 3 deletions(-)
>  create mode 100644 include/system/mshv.h
>  create mode 100644 target/i386/mshv/meson.build
>  create mode 100644 target/i386/mshv/x86.c
> 
> diff --git a/include/system/mshv.h b/include/system/mshv.h
> new file mode 100644
> index 0000000000..8380b92da2
> --- /dev/null
> +++ b/include/system/mshv.h
> @@ -0,0 +1,32 @@
> +/*
> + * QEMU MSHV support
> + *
> + * Copyright Microsoft, Corp. 2025
> + *
> + * Authors:
> + *  Ziqiao Zhou       <ziqiaozhou@microsoft.com>
> + *  Magnus Kulke      <magnuskulke@microsoft.com>
> + *  Jinank Jain       <jinankjain@microsoft.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.

FYI, for new files we now require use of SPDX-License-Identifier,
and omission of any manually written license boilerplate text.

checkpatch.pl is supposed to be warning about this, but it is
buggy & incomplete right now, but fixes for that are pending
to correctly warn about this.

With regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|