:p
atchew
Login
Notify VM exit is introduced to mitigate potential DoS attacks from a malicious VM. This series is the userspace part to enable this feature through a new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT. The corresponding KVM patch series is available at https://lore.kernel.org/lkml/20220310084001.10235-1-chenyi.qiang@intel.com/ Chenyi Qiang (2): linux-headers: Sync the linux headers i386: Add notify VM exit support hw/i386/x86.c | 24 +++++++++++++++ include/hw/i386/x86.h | 3 ++ linux-headers/asm-x86/kvm.h | 4 +++ linux-headers/linux/kvm.h | 29 +++++++++++++++---- target/i386/kvm/kvm.c | 58 ++++++++++++++++++++++++------------- 5 files changed, 93 insertions(+), 25 deletions(-) -- 2.17.1
Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> --- linux-headers/asm-x86/kvm.h | 4 ++++ linux-headers/linux/kvm.h | 29 ++++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/linux-headers/asm-x86/kvm.h b/linux-headers/asm-x86/kvm.h index XXXXXXX..XXXXXXX 100644 --- a/linux-headers/asm-x86/kvm.h +++ b/linux-headers/asm-x86/kvm.h @@ -XXX,XX +XXX,XX @@ struct kvm_reinject_control { #define KVM_VCPUEVENT_VALID_SHADOW 0x00000004 #define KVM_VCPUEVENT_VALID_SMM 0x00000008 #define KVM_VCPUEVENT_VALID_PAYLOAD 0x00000010 +#define KVM_VCPUEVENT_SHUTDOWN 0x00000020 /* Interrupt shadow states */ #define KVM_X86_SHADOW_INT_MOV_SS 0x01 @@ -XXX,XX +XXX,XX @@ struct kvm_sync_regs { #define KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE 0x00000001 +/* attributes for system fd (group 0) */ +#define KVM_X86_XCOMP_GUEST_SUPP 0 + struct kvm_vmx_nested_state_data { __u8 vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; __u8 shadow_vmcs12[KVM_STATE_NESTED_VMX_VMCS_SIZE]; diff --git a/linux-headers/linux/kvm.h b/linux-headers/linux/kvm.h index XXXXXXX..XXXXXXX 100644 --- a/linux-headers/linux/kvm.h +++ b/linux-headers/linux/kvm.h @@ -XXX,XX +XXX,XX @@ struct kvm_xen_exit { #define KVM_EXIT_X86_BUS_LOCK 33 #define KVM_EXIT_XEN 34 #define KVM_EXIT_RISCV_SBI 35 +#define KVM_EXIT_NOTIFY 36 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -XXX,XX +XXX,XX @@ struct kvm_run { unsigned long args[6]; unsigned long ret[2]; } riscv_sbi; + /* KVM_EXIT_NOTIFY */ + struct { +#define KVM_NOTIFY_CONTEXT_INVALID (1 << 0) + __u32 data; + } notify; /* Fix the size of the union. 
*/ char padding[256]; }; @@ -XXX,XX +XXX,XX @@ struct kvm_s390_mem_op { __u32 op; /* type of operation */ __u64 buf; /* buffer in userspace */ union { - __u8 ar; /* the access register number */ + struct { + __u8 ar; /* the access register number */ + __u8 key; /* access key, ignored if flag unset */ + }; __u32 sida_offset; /* offset into the sida */ - __u8 reserved[32]; /* should be set to 0 */ + __u8 reserved[32]; /* ignored */ }; }; /* types for kvm_s390_mem_op->op */ @@ -XXX,XX +XXX,XX @@ struct kvm_s390_mem_op { #define KVM_S390_MEMOP_LOGICAL_WRITE 1 #define KVM_S390_MEMOP_SIDA_READ 2 #define KVM_S390_MEMOP_SIDA_WRITE 3 +#define KVM_S390_MEMOP_ABSOLUTE_READ 4 +#define KVM_S390_MEMOP_ABSOLUTE_WRITE 5 /* flags for kvm_s390_mem_op->flags */ #define KVM_S390_MEMOP_F_CHECK_ONLY (1ULL << 0) #define KVM_S390_MEMOP_F_INJECT_EXCEPTION (1ULL << 1) +#define KVM_S390_MEMOP_F_SKEY_PROTECTION (1ULL << 2) /* for KVM_INTERRUPT */ struct kvm_interrupt { @@ -XXX,XX +XXX,XX @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 #define KVM_CAP_VM_GPA_BITS 207 #define KVM_CAP_XSAVE2 208 +#define KVM_CAP_SYS_ATTRIBUTES 209 +#define KVM_CAP_PPC_AIL_MODE_3 210 +#define KVM_CAP_S390_MEM_OP_EXTENSION 211 +#define KVM_CAP_PMU_CAPABILITY 212 +#define KVM_CAP_X86_NOTIFY_VMEXIT 213 #ifdef KVM_CAP_IRQ_ROUTING @@ -XXX,XX +XXX,XX @@ struct kvm_enc_region { #define KVM_S390_NORMAL_RESET _IO(KVMIO, 0xc3) #define KVM_S390_CLEAR_RESET _IO(KVMIO, 0xc4) -/* Available with KVM_CAP_XSAVE2 */ -#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) - struct kvm_s390_pv_sec_parm { __u64 origin; __u64 length; @@ -XXX,XX +XXX,XX @@ struct kvm_dirty_gfn { #define KVM_BUS_LOCK_DETECTION_OFF (1 << 0) #define KVM_BUS_LOCK_DETECTION_EXIT (1 << 1) +#define KVM_PMU_CAP_DISABLE (1 << 0) + /** * struct kvm_stats_header - Header of per vm/vcpu binary statistics data. * @flags: Some extra information for header, always 0 for now. 
@@ -XXX,XX +XXX,XX @@ struct kvm_stats_desc { #define KVM_GET_STATS_FD _IO(KVMIO, 0xce) +/* Available with KVM_CAP_XSAVE2 */ +#define KVM_GET_XSAVE2 _IOR(KVMIO, 0xcf, struct kvm_xsave) + #endif /* __LINUX_KVM_H */ -- 2.17.1
There are cases where a malicious virtual machine can cause the CPU to get stuck (because event windows never open up), e.g., an infinite loop in microcode when handling nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI and IRQ) can be delivered. This makes the CPU unavailable to the host or other VMs. Notify VM exit is introduced to mitigate this kind of attack: it generates a VM exit if no event window occurs in VM non-root mode for a specified amount of time (the notify window). A new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT is exposed to user space so that the user can query the capability and set the expected notify window when creating VMs. If a notify VM exit happens with VM_INVALID_CONTEXT, the hypervisor should exit to user space with the exit reason KVM_EXIT_NOTIFY to report the fatal case. User space can then inject a SHUTDOWN event into the target vcpu. This is implemented by defining a new bit in the flags field of kvm_vcpu_event in the KVM_SET_VCPU_EVENTS ioctl. Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> --- hw/i386/x86.c | 24 ++++++++++++++++++ include/hw/i386/x86.h | 3 +++ target/i386/kvm/kvm.c | 58 ++++++++++++++++++++++++++++--------------- 3 files changed, 65 insertions(+), 20 deletions(-) diff --git a/hw/i386/x86.c b/hw/i386/x86.c index XXXXXXX..XXXXXXX 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -XXX,XX +XXX,XX @@ static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name, qapi_free_SgxEPCList(list); } +static void x86_machine_get_notify_window(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + int32_t notify_window = x86ms->notify_window; + + visit_type_int32(v, name, &notify_window, errp); +} + +static void x86_machine_set_notify_window(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + visit_type_int32(v, name, &x86ms->notify_window, errp); +} + static void x86_machine_initfn(Object *obj) { 
X86MachineState *x86ms = X86_MACHINE(obj); @@ -XXX,XX +XXX,XX @@ static void x86_machine_initfn(Object *obj) x86ms->oem_id = g_strndup(ACPI_BUILD_APPNAME6, 6); x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); x86ms->bus_lock_ratelimit = 0; + x86ms->notify_window = -1; } static void x86_machine_class_init(ObjectClass *oc, void *data) @@ -XXX,XX +XXX,XX @@ static void x86_machine_class_init(ObjectClass *oc, void *data) NULL, NULL); object_class_property_set_description(oc, "sgx-epc", "SGX EPC device"); + + object_class_property_add(oc, X86_MACHINE_NOTIFY_WINDOW, "int32_t", + x86_machine_get_notify_window, + x86_machine_set_notify_window, NULL, NULL); + object_class_property_set_description(oc, X86_MACHINE_NOTIFY_WINDOW, + "Set the notify window required by notify VM exit"); } static const TypeInfo x86_machine_info = { diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h index XXXXXXX..XXXXXXX 100644 --- a/include/hw/i386/x86.h +++ b/include/hw/i386/x86.h @@ -XXX,XX +XXX,XX @@ struct X86MachineState { * which means no limitation on the guest's bus locks. 
*/ uint64_t bus_lock_ratelimit; + + int32_t notify_window; }; #define X86_MACHINE_SMM "smm" @@ -XXX,XX +XXX,XX @@ struct X86MachineState { #define X86_MACHINE_OEM_ID "x-oem-id" #define X86_MACHINE_OEM_TABLE_ID "x-oem-table-id" #define X86_MACHINE_BUS_LOCK_RATELIMIT "bus-lock-ratelimit" +#define X86_MACHINE_NOTIFY_WINDOW "notify-window" #define TYPE_X86_MACHINE MACHINE_TYPE_NAME("x86") OBJECT_DECLARE_TYPE(X86MachineState, X86MachineClass, X86_MACHINE) diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index XXXXXXX..XXXXXXX 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -XXX,XX +XXX,XX @@ int kvm_arch_init(MachineState *ms, KVMState *s) int ret; struct utsname utsname; Error *local_err = NULL; + X86MachineState *x86ms; + + assert(object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)); + x86ms = X86_MACHINE(ms); /* * Initialize SEV context, if required @@ -XXX,XX +XXX,XX @@ int kvm_arch_init(MachineState *ms, KVMState *s) } if (kvm_check_extension(s, KVM_CAP_X86_SMM) && - object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE) && - x86_machine_is_smm_enabled(X86_MACHINE(ms))) { + x86_machine_is_smm_enabled(x86ms)) { smram_machine_done.notify = register_smram_listener; qemu_add_machine_init_done_notifier(&smram_machine_done); } @@ -XXX,XX +XXX,XX @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } - if (object_dynamic_cast(OBJECT(ms), TYPE_X86_MACHINE)) { - X86MachineState *x86ms = X86_MACHINE(ms); + if (x86ms->bus_lock_ratelimit > 0) { + ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT); + if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) { + error_report("kvm: bus lock detection unsupported"); + return -ENOTSUP; + } + ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0, + KVM_BUS_LOCK_DETECTION_EXIT); + if (ret < 0) { + error_report("kvm: Failed to enable bus lock detection cap: %s", + strerror(-ret)); + return ret; + } + ratelimit_init(&bus_lock_ratelimit_ctrl); + ratelimit_set_speed(&bus_lock_ratelimit_ctrl, + x86ms->bus_lock_ratelimit, 
BUS_LOCK_SLICE_TIME); + } - if (x86ms->bus_lock_ratelimit > 0) { - ret = kvm_check_extension(s, KVM_CAP_X86_BUS_LOCK_EXIT); - if (!(ret & KVM_BUS_LOCK_DETECTION_EXIT)) { - error_report("kvm: bus lock detection unsupported"); - return -ENOTSUP; - } - ret = kvm_vm_enable_cap(s, KVM_CAP_X86_BUS_LOCK_EXIT, 0, - KVM_BUS_LOCK_DETECTION_EXIT); - if (ret < 0) { - error_report("kvm: Failed to enable bus lock detection cap: %s", - strerror(-ret)); - return ret; - } - ratelimit_init(&bus_lock_ratelimit_ctrl); - ratelimit_set_speed(&bus_lock_ratelimit_ctrl, - x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME); + if (kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) { + ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0, + x86ms->notify_window); + if (ret < 0) { + error_report("kvm: Failed to enable notify vmexit cap: %s", + strerror(-ret)); + return ret; } } @@ -XXX,XX +XXX,XX @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) X86CPU *cpu = X86_CPU(cs); uint64_t code; int ret; + struct kvm_vcpu_events events = {}; switch (run->exit_reason) { case KVM_EXIT_HLT: @@ -XXX,XX +XXX,XX @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) /* already handled in kvm_arch_post_run */ ret = 0; break; + case KVM_EXIT_NOTIFY: + ret = 0; + if (run->notify.data & KVM_NOTIFY_CONTEXT_INVALID) { + warn_report("KVM: invalid context due to notify vmexit"); + events.flags |= KVM_VCPUEVENT_SHUTDOWN; + ret = kvm_vcpu_ioctl(cs, KVM_SET_VCPU_EVENTS, &events); + } + break; default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); ret = -1; -- 2.17.1
Notify VM exit is introduced to mitigate potential DoS attacks from a malicious VM. This series is the userspace part to enable this feature through a new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT. The detailed info can be seen in Patch 2. The corresponding KVM support can be found in linux 6.0-rc: (2f4073e08f4c KVM: VMX: Enable Notify VM exit) This patch set depends on some definitions which can be updated from scripts/update-linux-headers.sh. The corresponding separate patch set is available at: https://lists.gnu.org/archive/html/qemu-devel/2022-09/msg02102.html --- Change logs: v6 -> v7 - Add a warning message when exiting to userspace (Peter Xu) - v6: https://lore.kernel.org/all/20220915092839.5518-1-chenyi.qiang@intel.com/ v5 -> v6 - Add some info related to the valid range of notify_window in patch 2. (Peter Xu) - Add the doc in qemu-options.hx. (Peter Xu) - v5: https://lore.kernel.org/qemu-devel/20220817020845.21855-1-chenyi.qiang@intel.com/ v4 -> v5 - Remove the assert check to avoid the nop in NDEBUG case. (Yuan) - v4: https://lore.kernel.org/qemu-devel/20220524140302.23272-1-chenyi.qiang@intel.com/ v3 -> v4 - Add a new KVM cap KVM_CAP_TRIPLE_FAULT_EVENT to guard the extension of triple fault event save&restore. - v3: https://lore.kernel.org/qemu-devel/20220421074028.18196-1-chenyi.qiang@intel.com/ --- Chenyi Qiang (2): i386: kvm: extend kvm_{get, put}_vcpu_events to support pending triple fault i386: Add notify VM exit support hw/i386/x86.c | 45 ++++++++++++++++++++++++++++++++++++ include/hw/i386/x86.h | 5 ++++ qemu-options.hx | 10 +++++++- target/i386/cpu.c | 1 + target/i386/cpu.h | 1 + target/i386/kvm/kvm.c | 54 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 115 insertions(+), 1 deletion(-) -- 2.17.1
For the direct triple faults, i.e. hardware detected and KVM morphed to VM-Exit, KVM will never lose them. But for triple faults synthesized by KVM, e.g. the RSM path, if KVM exits to userspace before the request is serviced, userspace could migrate the VM and lose the triple fault. A new flag KVM_VCPUEVENT_VALID_TRIPLE_FAULT is defined to signal that the events.triple_fault.pending field contains a valid state if the KVM_CAP_X86_TRIPLE_FAULT_EVENT capability is enabled. Acked-by: Peter Xu <peterx@redhat.com> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> --- target/i386/cpu.c | 1 + target/i386/cpu.h | 1 + target/i386/kvm/kvm.c | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index XXXXXXX..XXXXXXX 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -XXX,XX +XXX,XX @@ static void x86_cpu_reset(DeviceState *dev) env->exception_has_payload = false; env->exception_payload = 0; env->nmi_injected = false; + env->triple_fault_pending = false; #if !defined(CONFIG_USER_ONLY) /* We hard-wire the BSP to the first CPU. 
*/ apic_designate_bsp(cpu->apic_state, s->cpu_index == 0); diff --git a/target/i386/cpu.h b/target/i386/cpu.h index XXXXXXX..XXXXXXX 100644 --- a/target/i386/cpu.h +++ b/target/i386/cpu.h @@ -XXX,XX +XXX,XX @@ typedef struct CPUArchState { uint8_t has_error_code; uint8_t exception_has_payload; uint64_t exception_payload; + bool triple_fault_pending; uint32_t ins_len; uint32_t sipi_vector; bool tsc_valid; diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index XXXXXXX..XXXXXXX 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -XXX,XX +XXX,XX @@ static int has_xcrs; static int has_pit_state2; static int has_sregs2; static int has_exception_payload; +static int has_triple_fault_event; static bool has_msr_mcg_ext_ctl; @@ -XXX,XX +XXX,XX @@ int kvm_arch_init(MachineState *ms, KVMState *s) } } + has_triple_fault_event = kvm_check_extension(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT); + if (has_triple_fault_event) { + ret = kvm_vm_enable_cap(s, KVM_CAP_X86_TRIPLE_FAULT_EVENT, 0, true); + if (ret < 0) { + error_report("kvm: Failed to enable triple fault event cap: %s", + strerror(-ret)); + return ret; + } + } + ret = kvm_get_supported_msrs(s); if (ret < 0) { return ret; @@ -XXX,XX +XXX,XX @@ static int kvm_put_vcpu_events(X86CPU *cpu, int level) } } + if (has_triple_fault_event) { + events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + events.triple_fault.pending = env->triple_fault_pending; + } + return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_VCPU_EVENTS, &events); } @@ -XXX,XX +XXX,XX @@ static int kvm_get_vcpu_events(X86CPU *cpu) } } + if (events.flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { + env->triple_fault_pending = events.triple_fault.pending; + } + env->sipi_vector = events.sipi_vector; return 0; -- 2.17.1
There are cases where a malicious virtual machine can cause the CPU to get stuck (because event windows never open up), e.g., an infinite loop in microcode when handling nested #AC (CVE-2015-5307). No event window means no event (NMI, SMI and IRQ) can be delivered. This makes the CPU unavailable to the host or other VMs. Notify VM exit is introduced to mitigate this kind of attack: it generates a VM exit if no event window occurs in VM non-root mode for a specified amount of time (the notify window). A new KVM capability KVM_CAP_X86_NOTIFY_VMEXIT is exposed to user space so that the user can query the capability and set the expected notify window when creating VMs. The format of the argument when enabling this capability is as follows: Bit 63:32 - notify window specified in the qemu command line Bit 31:0 - some flags (e.g. KVM_X86_NOTIFY_VMEXIT_ENABLED is set to enable the feature.) There are some concerns, however: e.g. a notify VM exit may happen with VM_CONTEXT_INVALID set in the exit qualification (no cases are anticipated that would set this bit), which means the VM context is corrupted. To avoid false positives that would get a well-behaved guest killed, this feature is disabled by default. Users can enable the feature via new machine properties: qemu -machine notify_vmexit=on,notify_window=0 ... Note that notify_window is only valid when notify_vmexit is on. The value of notify_window must be non-negative. It is even safe to set it to zero, since an internal hardware threshold is added to the window to ensure there are no false positives. A new KVM exit reason KVM_EXIT_NOTIFY is defined for notify VM exit. If it happens with VM_CONTEXT_INVALID, the hypervisor exits to user space to report the fatal case. User space can then inject a SHUTDOWN event into the target vcpu. This is implemented by injecting a synthesized triple fault event.
Acked-by: Peter Xu <peterx@redhat.com> Signed-off-by: Chenyi Qiang <chenyi.qiang@intel.com> --- hw/i386/x86.c | 45 +++++++++++++++++++++++++++++++++++++++++++ include/hw/i386/x86.h | 5 +++++ qemu-options.hx | 10 +++++++++- target/i386/kvm/kvm.c | 34 ++++++++++++++++++++++++++++++++ 4 files changed, 93 insertions(+), 1 deletion(-) diff --git a/hw/i386/x86.c b/hw/i386/x86.c index XXXXXXX..XXXXXXX 100644 --- a/hw/i386/x86.c +++ b/hw/i386/x86.c @@ -XXX,XX +XXX,XX @@ static void machine_set_sgx_epc(Object *obj, Visitor *v, const char *name, qapi_free_SgxEPCList(list); } +static bool x86_machine_get_notify_vmexit(Object *obj, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + return x86ms->notify_vmexit; +} + +static void x86_machine_set_notify_vmexit(Object *obj, bool value, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + x86ms->notify_vmexit = value; +} + +static void x86_machine_get_notify_window(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + uint32_t notify_window = x86ms->notify_window; + + visit_type_uint32(v, name, &notify_window, errp); +} + +static void x86_machine_set_notify_window(Object *obj, Visitor *v, + const char *name, void *opaque, Error **errp) +{ + X86MachineState *x86ms = X86_MACHINE(obj); + + visit_type_uint32(v, name, &x86ms->notify_window, errp); +} + static void x86_machine_initfn(Object *obj) { X86MachineState *x86ms = X86_MACHINE(obj); @@ -XXX,XX +XXX,XX @@ static void x86_machine_initfn(Object *obj) x86ms->oem_table_id = g_strndup(ACPI_BUILD_APPNAME8, 8); x86ms->bus_lock_ratelimit = 0; x86ms->above_4g_mem_start = 4 * GiB; + x86ms->notify_vmexit = false; + x86ms->notify_window = 0; } static void x86_machine_class_init(ObjectClass *oc, void *data) @@ -XXX,XX +XXX,XX @@ static void x86_machine_class_init(ObjectClass *oc, void *data) NULL, NULL); object_class_property_set_description(oc, "sgx-epc", "SGX EPC device"); + + 
object_class_property_add(oc, X86_MACHINE_NOTIFY_WINDOW, "uint32_t", + x86_machine_get_notify_window, + x86_machine_set_notify_window, NULL, NULL); + object_class_property_set_description(oc, X86_MACHINE_NOTIFY_WINDOW, + "Set the notify window required by notify VM exit"); + + object_class_property_add_bool(oc, X86_MACHINE_NOTIFY_VMEXIT, + x86_machine_get_notify_vmexit, + x86_machine_set_notify_vmexit); + object_class_property_set_description(oc, X86_MACHINE_NOTIFY_VMEXIT, + "Enable notify VM exit"); } static const TypeInfo x86_machine_info = { diff --git a/include/hw/i386/x86.h b/include/hw/i386/x86.h index XXXXXXX..XXXXXXX 100644 --- a/include/hw/i386/x86.h +++ b/include/hw/i386/x86.h @@ -XXX,XX +XXX,XX @@ struct X86MachineState { * which means no limitation on the guest's bus locks. */ uint64_t bus_lock_ratelimit; + + bool notify_vmexit; + uint32_t notify_window; }; #define X86_MACHINE_SMM "smm" @@ -XXX,XX +XXX,XX @@ struct X86MachineState { #define X86_MACHINE_OEM_ID "x-oem-id" #define X86_MACHINE_OEM_TABLE_ID "x-oem-table-id" #define X86_MACHINE_BUS_LOCK_RATELIMIT "bus-lock-ratelimit" +#define X86_MACHINE_NOTIFY_VMEXIT "notify-vmexit" +#define X86_MACHINE_NOTIFY_WINDOW "notify-window" #define TYPE_X86_MACHINE MACHINE_TYPE_NAME("x86") OBJECT_DECLARE_TYPE(X86MachineState, X86MachineClass, X86_MACHINE) diff --git a/qemu-options.hx b/qemu-options.hx index XXXXXXX..XXXXXXX 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -XXX,XX +XXX,XX @@ DEF("machine", HAS_ARG, QEMU_OPTION_machine, \ " memory-encryption=@var{} memory encryption object to use (default=none)\n" " hmat=on|off controls ACPI HMAT support (default=off)\n" " memory-backend='backend-id' specifies explicitly provided backend for main RAM (default=none)\n" - " cxl-fmw.0.targets.0=firsttarget,cxl-fmw.0.targets.1=secondtarget,cxl-fmw.0.size=size[,cxl-fmw.0.interleave-granularity=granularity]\n", + " 
cxl-fmw.0.targets.0=firsttarget,cxl-fmw.0.targets.1=secondtarget,cxl-fmw.0.size=size[,cxl-fmw.0.interleave-granularity=granularity]\n" + " notify_vmexit=on|off,notify_window=n controls notify VM exit support (default=off) and specifies the notify window size (default=0)\n", QEMU_ARCH_ALL) SRST ``-machine [type=]name[,prop=value[,...]]`` @@ -XXX,XX +XXX,XX @@ SRST :: -machine cxl-fmw.0.targets.0=cxl.0,cxl-fmw.0.targets.1=cxl.1,cxl-fmw.0.size=128G,cxl-fmw.0.interleave-granularity=512k + + ``notify_vmexit=on|off,notify_window=n`` + Enables or disables Notify VM exit support on x86 host and specify + the corresponding notify window to trigger the VM exit if enabled. + This feature can mitigate the CPU stuck issue due to event windows + don't open up for a specified of time (notify window). + The default is off. ERST DEF("M", HAS_ARG, QEMU_OPTION_M, diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c index XXXXXXX..XXXXXXX 100644 --- a/target/i386/kvm/kvm.c +++ b/target/i386/kvm/kvm.c @@ -XXX,XX +XXX,XX @@ int kvm_arch_init(MachineState *ms, KVMState *s) ratelimit_set_speed(&bus_lock_ratelimit_ctrl, x86ms->bus_lock_ratelimit, BUS_LOCK_SLICE_TIME); } + + if (x86ms->notify_vmexit && + kvm_check_extension(s, KVM_CAP_X86_NOTIFY_VMEXIT)) { + uint64_t notify_window_flags = + ((uint64_t)x86ms->notify_window << 32) | + KVM_X86_NOTIFY_VMEXIT_ENABLED | + KVM_X86_NOTIFY_VMEXIT_USER; + ret = kvm_vm_enable_cap(s, KVM_CAP_X86_NOTIFY_VMEXIT, 0, + notify_window_flags); + if (ret < 0) { + error_report("kvm: Failed to enable notify vmexit cap: %s", + strerror(-ret)); + return ret; + } + } } return 0; @@ -XXX,XX +XXX,XX @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) X86CPU *cpu = X86_CPU(cs); uint64_t code; int ret; + struct kvm_vcpu_events events = {}; + bool ctx_invalid; switch (run->exit_reason) { case KVM_EXIT_HLT: @@ -XXX,XX +XXX,XX @@ int kvm_arch_handle_exit(CPUState *cs, struct kvm_run *run) /* already handled in kvm_arch_post_run */ ret = 0; break; + case 
KVM_EXIT_NOTIFY: + ctx_invalid = !!(run->notify.flags & KVM_NOTIFY_CONTEXT_INVALID); + ret = 0; + warn_report_once("KVM: encounter a notify exit with %svalid context in" + " guest. It means there can be possible misbehaves in" + " guest, please have a look.", + ctx_invalid ? "in" : ""); + if (ctx_invalid) { + if (has_triple_fault_event) { + events.flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; + events.triple_fault.pending = true; + ret = kvm_vcpu_ioctl(cs, KVM_SET_VCPU_EVENTS, &events); + } else { + ret = -1; + } + } + break; default: fprintf(stderr, "KVM: unknown exit reason %d\n", run->exit_reason); ret = -1; -- 2.17.1