From: Isaku Yamahata <isaku.yamahata@intel.com>
The next step of TDX guest creation is to create vcpu. Allocate TDX vcpu
structures, partially initialize it. Allocate pages of TDX vcpu for the
TDX module. Actual donation TDX vcpu pages to the TDX module is not done
yet.
In the case of the conventional case, cpuid is empty at the initialization.
and cpuid is configured after the vcpu initialization. Because TDX
supports only X2APIC mode, cpuid is forcibly initialized to support X2APIC
on the vcpu initialization.
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
Changes v10 -> v11:
- NULL check of kvmalloc_array() in tdx_vcpu_reset. Move it to
tdx_vcpu_create()
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
arch/x86/kvm/vmx/main.c | 40 ++++++++++++++++++--
arch/x86/kvm/vmx/tdx.c | 75 ++++++++++++++++++++++++++++++++++++++
arch/x86/kvm/vmx/x86_ops.h | 10 +++++
arch/x86/kvm/x86.c | 2 +
4 files changed, 123 insertions(+), 4 deletions(-)
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index ddf0742f1f67..59813ca05f36 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -63,6 +63,38 @@ static void vt_vm_free(struct kvm *kvm)
tdx_vm_free(kvm);
}
+static int vt_vcpu_precreate(struct kvm *kvm)
+{
+ if (is_td(kvm))
+ return 0;
+
+ return vmx_vcpu_precreate(kvm);
+}
+
+static int vt_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_create(vcpu);
+
+ return vmx_vcpu_create(vcpu);
+}
+
+static void vt_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_free(vcpu);
+
+ return vmx_vcpu_free(vcpu);
+}
+
+static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ if (is_td_vcpu(vcpu))
+ return tdx_vcpu_reset(vcpu, init_event);
+
+ return vmx_vcpu_reset(vcpu, init_event);
+}
+
static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
{
if (!is_td(kvm))
@@ -90,10 +122,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.vm_destroy = vt_vm_destroy,
.vm_free = vt_vm_free,
- .vcpu_precreate = vmx_vcpu_precreate,
- .vcpu_create = vmx_vcpu_create,
- .vcpu_free = vmx_vcpu_free,
- .vcpu_reset = vmx_vcpu_reset,
+ .vcpu_precreate = vt_vcpu_precreate,
+ .vcpu_create = vt_vcpu_create,
+ .vcpu_free = vt_vcpu_free,
+ .vcpu_reset = vt_vcpu_reset,
.prepare_switch_to_guest = vmx_prepare_switch_to_guest,
.vcpu_load = vmx_vcpu_load,
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 557a609c5147..099f0737a5aa 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -281,6 +281,81 @@ int tdx_vm_init(struct kvm *kvm)
return 0;
}
+int tdx_vcpu_create(struct kvm_vcpu *vcpu)
+{
+ struct kvm_cpuid_entry2 *e;
+
+ /*
+ * On cpu creation, cpuid entry is blank. Forcibly enable
+ * X2APIC feature to allow X2APIC.
+ * Because vcpu_reset() can't return error, allocation is done here.
+ */
+ WARN_ON_ONCE(vcpu->arch.cpuid_entries);
+ WARN_ON_ONCE(vcpu->arch.cpuid_nent);
+ e = kvmalloc_array(1, sizeof(*e), GFP_KERNEL_ACCOUNT);
+ if (!e)
+ return -ENOMEM;
+ *e = (struct kvm_cpuid_entry2) {
+ .function = 1, /* Features for X2APIC */
+ .index = 0,
+ .eax = 0,
+ .ebx = 0,
+ .ecx = 1ULL << 21, /* X2APIC */
+ .edx = 0,
+ };
+ vcpu->arch.cpuid_entries = e;
+ vcpu->arch.cpuid_nent = 1;
+
+ /* TDX only supports x2APIC, which requires an in-kernel local APIC. */
+ if (!vcpu->arch.apic)
+ return -EINVAL;
+
+ fpstate_set_confidential(&vcpu->arch.guest_fpu);
+
+ vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX;
+
+ vcpu->arch.cr0_guest_owned_bits = -1ul;
+ vcpu->arch.cr4_guest_owned_bits = -1ul;
+
+ vcpu->arch.tsc_offset = to_kvm_tdx(vcpu->kvm)->tsc_offset;
+ vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset;
+ vcpu->arch.guest_state_protected =
+ !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTRIBUTE_DEBUG);
+
+ return 0;
+}
+
+void tdx_vcpu_free(struct kvm_vcpu *vcpu)
+{
+ /* This is stub for now. More logic will come. */
+}
+
+void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
+{
+ struct msr_data apic_base_msr;
+
+ /* TDX doesn't support INIT event. */
+ if (WARN_ON_ONCE(init_event))
+ goto td_bugged;
+
+ /* TDX rquires X2APIC. */
+ apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC;
+ if (kvm_vcpu_is_reset_bsp(vcpu))
+ apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
+ apic_base_msr.host_initiated = true;
+ if (WARN_ON_ONCE(kvm_set_apic_base(vcpu, &apic_base_msr)))
+ goto td_bugged;
+
+ /*
+ * Don't update mp_state to runnable because more initialization
+ * is needed by TDX_VCPU_INIT.
+ */
+ return;
+
+td_bugged:
+ vcpu->kvm->vm_bugged = true;
+}
+
int tdx_dev_ioctl(void __user *argp)
{
struct kvm_tdx_capabilities __user *user_caps;
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index 6c40dda1cc2f..37ab2cfd35bc 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -147,7 +147,12 @@ int tdx_offline_cpu(void);
int tdx_vm_init(struct kvm *kvm);
void tdx_mmu_release_hkid(struct kvm *kvm);
void tdx_vm_free(struct kvm *kvm);
+
int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
+
+int tdx_vcpu_create(struct kvm_vcpu *vcpu);
+void tdx_vcpu_free(struct kvm_vcpu *vcpu);
+void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
#else
static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return 0; }
static inline void tdx_hardware_unsetup(void) {}
@@ -159,7 +164,12 @@ static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; }
static inline void tdx_mmu_release_hkid(struct kvm *kvm) {}
static inline void tdx_flush_shadow_all_private(struct kvm *kvm) {}
static inline void tdx_vm_free(struct kvm *kvm) {}
+
static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; }
+
+static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
+static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
+static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
#endif
#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1fb135e0c98f..e8bc66031a1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -492,6 +492,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
kvm_recalculate_apic_map(vcpu->kvm);
return 0;
}
+EXPORT_SYMBOL_GPL(kvm_set_apic_base);
/*
* Handle a fault on a hardware virtualization (VMX or SVM) instruction.
@@ -12109,6 +12110,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
{
--
2.25.1
On Thu, 2023-01-12 at 08:31 -0800, isaku.yamahata@intel.com wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > > The next step of TDX guest creation is to create vcpu. Allocate TDX vcpu > structures, partially initialize it. > Why partially initialize it? Shouldn't a better way be either: 1) not initialize at all, or; 2) fully initialize? Can you put more _why_ here? > Allocate pages of TDX vcpu for the > TDX module. Actual donation TDX vcpu pages to the TDX module is not done > yet. Also, can you explain _why_ it is not done here? > > In the case of the conventional case, cpuid is empty at the initialization. > and cpuid is configured after the vcpu initialization. Because TDX > supports only X2APIC mode, cpuid is forcibly initialized to support X2APIC > on the vcpu initialization. Don't quite understand here. As you said CPUID entries are configured later in KVM_SET_CPUID2, so what's the point of initializing CPUID to support x2apic here? Are you suggesting KVM_SET_CPUID2 will be somehow rejected for TDX guest, or there will be special handling to make sure the CPUID initialized here won't be overwritten later? Please explain clearly here. > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > --- > Changes v10 -> v11: > - NULL check of kvmalloc_array() in tdx_vcpu_reset. Move it to > tdx_vcpu_create() > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > --- > arch/x86/kvm/vmx/main.c | 40 ++++++++++++++++++-- > arch/x86/kvm/vmx/tdx.c | 75 ++++++++++++++++++++++++++++++++++++++ > arch/x86/kvm/vmx/x86_ops.h | 10 +++++ > arch/x86/kvm/x86.c | 2 + > 4 files changed, 123 insertions(+), 4 deletions(-) > > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c > index ddf0742f1f67..59813ca05f36 100644 > --- a/arch/x86/kvm/vmx/main.c > +++ b/arch/x86/kvm/vmx/main.c > @@ -63,6 +63,38 @@ static void vt_vm_free(struct kvm *kvm) > tdx_vm_free(kvm); > } > > +static int vt_vcpu_precreate(struct kvm *kvm) > +{ > + if (is_td(kvm)) > + return 0; > + > + return vmx_vcpu_precreate(kvm); > +} > + > +static int vt_vcpu_create(struct kvm_vcpu *vcpu) > +{ > + if (is_td_vcpu(vcpu)) > + return tdx_vcpu_create(vcpu); > + > + return vmx_vcpu_create(vcpu); > +} > + > +static void vt_vcpu_free(struct kvm_vcpu *vcpu) > +{ > + if (is_td_vcpu(vcpu)) > + return tdx_vcpu_free(vcpu); > + > + return vmx_vcpu_free(vcpu); > +} > + > +static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > +{ > + if (is_td_vcpu(vcpu)) > + return tdx_vcpu_reset(vcpu, init_event); > + > + return vmx_vcpu_reset(vcpu, init_event); > +} > + > static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) > { > if (!is_td(kvm)) > @@ -90,10 +122,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = { > .vm_destroy = vt_vm_destroy, > .vm_free = vt_vm_free, > > - .vcpu_precreate = vmx_vcpu_precreate, > - .vcpu_create = vmx_vcpu_create, > - .vcpu_free = vmx_vcpu_free, > - .vcpu_reset = vmx_vcpu_reset, > + .vcpu_precreate = vt_vcpu_precreate, > + .vcpu_create = vt_vcpu_create, > + .vcpu_free = vt_vcpu_free, > + .vcpu_reset = vt_vcpu_reset, > > .prepare_switch_to_guest = vmx_prepare_switch_to_guest, > .vcpu_load = vmx_vcpu_load, > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > index 557a609c5147..099f0737a5aa 100644 > --- a/arch/x86/kvm/vmx/tdx.c > +++ b/arch/x86/kvm/vmx/tdx.c > @@ -281,6 +281,81 @@ int tdx_vm_init(struct kvm *kvm) > return 0; > } > > +int tdx_vcpu_create(struct kvm_vcpu *vcpu) > +{ > + struct kvm_cpuid_entry2 *e; > + > + /* > + * On cpu creation, cpuid entry is blank. Forcibly enable > + * X2APIC feature to allow X2APIC. > + * Because vcpu_reset() can't return error, allocation is done here. > + */ > + WARN_ON_ONCE(vcpu->arch.cpuid_entries); > + WARN_ON_ONCE(vcpu->arch.cpuid_nent); > + e = kvmalloc_array(1, sizeof(*e), GFP_KERNEL_ACCOUNT); You don't need to use kvmalloc_array() when only allocating one entry. > + if (!e) > + return -ENOMEM; > + *e = (struct kvm_cpuid_entry2) { > + .function = 1, /* Features for X2APIC */ > + .index = 0, > + .eax = 0, > + .ebx = 0, > + .ecx = 1ULL << 21, /* X2APIC */ > + .edx = 0, > + }; > + vcpu->arch.cpuid_entries = e; > + vcpu->arch.cpuid_nent = 1; As mentioned above, why doing it here? Won't be this be overwritten later in KVM_SET_CPUID2? > + > + /* TDX only supports x2APIC, which requires an in-kernel local APIC. */ > + if (!vcpu->arch.apic) > + return -EINVAL; If this is hit, what happens to the CPUID entry allocated above? It's absolutely not clear here in this patch. > + > + fpstate_set_confidential(&vcpu->arch.guest_fpu); > + > + vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; > + > + vcpu->arch.cr0_guest_owned_bits = -1ul; > + vcpu->arch.cr4_guest_owned_bits = -1ul; > + > + vcpu->arch.tsc_offset = to_kvm_tdx(vcpu->kvm)->tsc_offset; > + vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; > + vcpu->arch.guest_state_protected = > + !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTRIBUTE_DEBUG); > + > + return 0; > +} > + > +void tdx_vcpu_free(struct kvm_vcpu *vcpu) > +{ > + /* This is stub for now. More logic will come. */ > +} > + > +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > +{ > + struct msr_data apic_base_msr; > + > + /* TDX doesn't support INIT event. */ > + if (WARN_ON_ONCE(init_event)) > + goto td_bugged; Should we use KVM_BUG_ON()? Again, it appears this depends on how KVM handles INIT, which is done in a later patch far way: [PATCH v11 102/113] KVM: TDX: Silently ignore INIT/SIPI And there's no material explaining how it is handled in either changelog or comment, so to me it's not reviewable. > + > + /* TDX rquires X2APIC. */ > + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC; > + if (kvm_vcpu_is_reset_bsp(vcpu)) > + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; > + apic_base_msr.host_initiated = true; > + if (WARN_ON_ONCE(kvm_set_apic_base(vcpu, &apic_base_msr))) > + goto td_bugged; I think we have KVM_BUG_ON()? TDX requires a lot more staff then just x2apic, why only x2apic is done here, particularly in _this_ patch? > + > + /* > + * Don't update mp_state to runnable because more initialization > + * is needed by TDX_VCPU_INIT. > + */ > + return; > + > +td_bugged: > + vcpu->kvm->vm_bugged = true; > +} > + > [snip]
On Thu, Jan 19, 2023 at 12:45:09AM +0000, "Huang, Kai" <kai.huang@intel.com> wrote: > On Thu, 2023-01-12 at 08:31 -0800, isaku.yamahata@intel.com wrote: > > From: Isaku Yamahata <isaku.yamahata@intel.com> > > > > The next step of TDX guest creation is to create vcpu. Allocate TDX vcpu > > structures, partially initialize it. > > > > Why partially initialize it? Shouldn't a better way be either: 1) not > initialize at all, or; 2) fully initialize? > > Can you put more _why_ here? > > > > Allocate pages of TDX vcpu for the > > TDX module. Actual donation TDX vcpu pages to the TDX module is not done > > yet. > > Also, can you explain _why_ it is not done here? > > > > > In the case of the conventional case, cpuid is empty at the initialization. > > and cpuid is configured after the vcpu initialization. Because TDX > > supports only X2APIC mode, cpuid is forcibly initialized to support X2APIC > > on the vcpu initialization. > > Don't quite understand here. As you said CPUID entries are configured later in > KVM_SET_CPUID2, so what's the point of initializing CPUID to support x2api> Are you suggesting KVM_SET_CPUID2 will be somehow rejected for TDX guest, or > there will be special handling to make sure the CPUID initialized here won't be > overwritten later? > > Please explain clearly here. Here is the updated one. The next step of TDX guest creation is to create vcpu. Allocate TDX vcpu structures, initialize it that doesn't require TDX SEAMCALL. TDX specific vcpuid initialization will be implemented as independent KVM_TDX_INIT_VCPU so that when error occurs it's easy to determine which component has the issue, KVM or TDX. In the case of the conventional case, cpuid is empty at the initialization. and cpuid is configured after the vcpu initialization. Because TDX supports only X2APIC mode, cpuid is forcibly set to support X2APIC and APIC BASE MSR is forcibly set to X2APIC mode. The MSR will be read only for guest TD. Because kvm_arch_vcpu_create() also initializes kvm MMU that depends on local apic settings. So x2apic needs to be initialized to X2APIC mode by vcpu_reset method before KVM mmu initialization. > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > > index 557a609c5147..099f0737a5aa 100644 > > --- a/arch/x86/kvm/vmx/tdx.c > > +++ b/arch/x86/kvm/vmx/tdx.c > > @@ -281,6 +281,81 @@ int tdx_vm_init(struct kvm *kvm) > > return 0; > > } > > > > +int tdx_vcpu_create(struct kvm_vcpu *vcpu) > > +{ > > + struct kvm_cpuid_entry2 *e; > > + > > + /* > > + * On cpu creation, cpuid entry is blank. Forcibly enable > > + * X2APIC feature to allow X2APIC. > > + * Because vcpu_reset() can't return error, allocation is done here. > > + */ > > + WARN_ON_ONCE(vcpu->arch.cpuid_entries); > > + WARN_ON_ONCE(vcpu->arch.cpuid_nent); > > + e = kvmalloc_array(1, sizeof(*e), GFP_KERNEL_ACCOUNT); > > You don't need to use kvmalloc_array() when only allocating one entry. I'll make it kvmalloc() and add comment on kvmalloc(), not kmalloc(). > > + if (!e) > > + return -ENOMEM; > > + *e = (struct kvm_cpuid_entry2) { > > + .function = 1, /* Features for X2APIC */ > > + .index = 0, > > + .eax = 0, > > + .ebx = 0, > > + .ecx = 1ULL << 21, /* X2APIC */ > > + .edx = 0, > > + }; > > + vcpu->arch.cpuid_entries = e; > > + vcpu->arch.cpuid_nent = 1; > > As mentioned above, why doing it here? Won't be this be overwritten later in > KVM_SET_CPUID2? Yes, user space VMM can overwrite cpuid[0x1] and APIC base MSR. But it doesn't matter because it's a bug of user space VMM. user space VMM has to keep the consistency of cpuid and MSRs. Because TDX module virtualizes cpuid[0x1].x2apic to fixed 1, KVM value doesn't matter after vcpu creation. Because KVM virtualizes APIC base as read only to guest, cpuid[0x1].x2apic doesn't matter after vcpu creation as long as user space VMM keeps KVM APIC BASE value. I'll add a comment. > > + > > + /* TDX only supports x2APIC, which requires an in-kernel local APIC. */ > > + if (!vcpu->arch.apic) > > + return -EINVAL; > > If this is hit, what happens to the CPUID entry allocated above? It's > absolutely not clear here in this patch. It's memory leak. I'll move the check before memory allocation. > > + > > + fpstate_set_confidential(&vcpu->arch.guest_fpu); > > + > > + vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; > > + > > + vcpu->arch.cr0_guest_owned_bits = -1ul; > > + vcpu->arch.cr4_guest_owned_bits = -1ul; > > + > > + vcpu->arch.tsc_offset = to_kvm_tdx(vcpu->kvm)->tsc_offset; > > + vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; > > + vcpu->arch.guest_state_protected = > > + !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTRIBUTE_DEBUG); > > + > > + return 0; > > +} > > + > > +void tdx_vcpu_free(struct kvm_vcpu *vcpu) > > +{ > > + /* This is stub for now. More logic will come. */ > > +} > > + > > +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > +{ > > + struct msr_data apic_base_msr; > > + > > + /* TDX doesn't support INIT event. */ > > + if (WARN_ON_ONCE(init_event)) > > + goto td_bugged; > > Should we use KVM_BUG_ON()? > > Again, it appears this depends on how KVM handles INIT, which is done in a later > patch far way: > > [PATCH v11 102/113] KVM: TDX: Silently ignore INIT/SIPI > > And there's no material explaining how it is handled in either changelog or > comment, so to me it's not reviewable. I'll convert them to KVM_BUG_ON(). With this patch, I'll remove WARN_ON_ONCE() and add KVM_BUG_ON() with the later patch. > > + > > + /* TDX rquires X2APIC. */ > > + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC; > > + if (kvm_vcpu_is_reset_bsp(vcpu)) > > + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; > > + apic_base_msr.host_initiated = true; > > + if (WARN_ON_ONCE(kvm_set_apic_base(vcpu, &apic_base_msr))) > > + goto td_bugged; > > I think we have KVM_BUG_ON()? > > TDX requires a lot more staff then just x2apic, why only x2apic is done here, > particularly in _this_ patch? After this callback, kvm mmu initialization follows. It depends on apic setting. X2APIC is only devication from the common logic kvm_lapic_reset(). I'll add a comment. -- Isaku Yamahata <isaku.yamahata@gmail.com>
On Tue, 2023-02-28 at 03:06 -0800, Isaku Yamahata wrote: > > > + if (!e) > > > + return -ENOMEM; > > > + *e = (struct kvm_cpuid_entry2) { > > > + .function = 1, /* Features for X2APIC */ > > > + .index = 0, > > > + .eax = 0, > > > + .ebx = 0, > > > + .ecx = 1ULL << 21, /* X2APIC */ > > > + .edx = 0, > > > + }; > > > + vcpu->arch.cpuid_entries = e; > > > + vcpu->arch.cpuid_nent = 1; > > > > As mentioned above, why doing it here? Won't be this be overwritten later in > > KVM_SET_CPUID2? > > Yes, user space VMM can overwrite cpuid[0x1] and APIC base MSR. But it > doesn't > matter because it's a bug of user space VMM. user space VMM has to keep the > consistency of cpuid and MSRs. > Because TDX module virtualizes cpuid[0x1].x2apic to fixed 1, KVM value doesn't > matter after vcpu creation. > Because KVM virtualizes APIC base as read only to guest, cpuid[0x1].x2apic > doesn't matter after vcpu creation as long as user space VMM keeps KVM APIC > BASE > value. > Contrary, can we depend on userspace VMM to set x2APIC in CPUID, but not do this in KVM? If userspace doesn't do it, we treat it as userspace's bug. Plus, userspace anyway needs to set x2APIC in CPUID regardless whether you have done above here, correct? I don't see the point of doing above in KVM because you are neither enforcing anything in KVM, nor you are reducing effort of userspace.
On Tue, Feb 28, 2023 at 11:52:59AM +0000, "Huang, Kai" <kai.huang@intel.com> wrote: > On Tue, 2023-02-28 at 03:06 -0800, Isaku Yamahata wrote: > > > > + if (!e) > > > > + return -ENOMEM; > > > > + *e = (struct kvm_cpuid_entry2) { > > > > + .function = 1, /* Features for X2APIC */ > > > > + .index = 0, > > > > + .eax = 0, > > > > + .ebx = 0, > > > > + .ecx = 1ULL << 21, /* X2APIC */ > > > > + .edx = 0, > > > > + }; > > > > + vcpu->arch.cpuid_entries = e; > > > > + vcpu->arch.cpuid_nent = 1; > > > > > > As mentioned above, why doing it here? Won't be this be overwritten later in > > > KVM_SET_CPUID2? > > > > Yes, user space VMM can overwrite cpuid[0x1] and APIC base MSR. But it > > doesn't > > matter because it's a bug of user space VMM. user space VMM has to keep the > > consistency of cpuid and MSRs. > > Because TDX module virtualizes cpuid[0x1].x2apic to fixed 1, KVM value doesn't > > matter after vcpu creation. > > Because KVM virtualizes APIC base as read only to guest, cpuid[0x1].x2apic > > doesn't matter after vcpu creation as long as user space VMM keeps KVM APIC > > BASE > > value. > > > > Contrary, can we depend on userspace VMM to set x2APIC in CPUID, but not do this > in KVM? If userspace doesn't do it, we treat it as userspace's bug. > > Plus, userspace anyway needs to set x2APIC in CPUID regardless whether you have > done above here, correct? > > I don't see the point of doing above in KVM because you are neither enforcing > anything in KVM, nor you are reducing effort of userspace. Good idea. I can drop cpuid part from tdx_vcpu_create() and apic base part from tdx_vcpu_reset(). It needs to modify tdx_has_emulated_msr() to allow user space VMM to update APIC BASE MSR. diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c index e045a8132639..46c82ce3ef46 100644 --- a/arch/x86/kvm/vmx/tdx.c +++ b/arch/x86/kvm/vmx/tdx.c @@ -2624,7 +2624,14 @@ int tdx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) int tdx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) { - if (tdx_has_emulated_msr(msr->index, true)) + if (tdx_has_emulated_msr(msr->index, true) || + /* + * user space VMM should explicitly set to X2APIC mode as initial + * value that is deviated from the conventional case. + */ + (msr->host_initiated && msr->index == MSR_IA32_APICBASE && + (msr->data & ~MSR_IA32_APICBASE_BSP) == + (APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC))) return kvm_set_msr_common(vcpu, msr); return 1; } Just FYI, qemu needs the following change. --- a/target/i386/kvm/tdx.c +++ b/target/i386/kvm/tdx.c @@ -274,6 +274,11 @@ bool is_tdx_vm(void) return !!tdx_guest; } +bool is_tdx_vm_finalized(void) +{ + return !!tdx_guest && tdx_guest->parent_obj.ready; +} + static inline uint32_t host_cpuid_reg(uint32_t function, uint32_t index, int reg) { @@ -875,10 +880,20 @@ static void tdx_post_init_vcpus(void) TdxFirmwareEntry *hob; CPUState *cpu; int r; + uint64_t apic_base; hob = tdx_get_hob_entry(tdx_guest); CPU_FOREACH(cpu) { - apic_force_x2apic(X86_CPU(cpu)->apic_state); + X86CPU *x86_cpu = X86_CPU(cpu); + + apic_force_x2apic(x86_cpu->apic_state); + + apic_base = APIC_DEFAULT_ADDRESS | MSR_IA32_APICBASE_ENABLE | + MSR_IA32_APICBASE_EXTD; + if (cpu_is_bsp(x86_cpu)) + apic_base |= MSR_IA32_APICBASE_BSP; + cpu_set_apic_base(x86_cpu->apic_state, apic_base); + kvm_put_apicbase(x86_cpu, apic_base); r = tdx_vcpu_ioctl(cpu, KVM_TDX_INIT_VCPU, 0, (void *)hob->address); if (r < 0) { diff --git a/target/i386/kvm/tdx.h b/target/i386/kvm/tdx.h index 5cc0f730afa6..f77689464738 100644 --- a/target/i386/kvm/tdx.h +++ b/target/i386/kvm/tdx.h @@ -59,8 +59,10 @@ typedef struct TdxGuest { #ifdef CONFIG_TDX bool is_tdx_vm(void); +bool is_tdx_vm_finalized(void); #else #define is_tdx_vm() 0 +#define is_tdx_vm_finalized() false; #endif /* CONFIG_TDX */ int tdx_kvm_init(MachineState *ms, Error **errp); -- Isaku Yamahata <isaku.yamahata@gmail.com>
On Tue, 2023-02-28 at 12:18 -0800, Isaku Yamahata wrote: > On Tue, Feb 28, 2023 at 11:52:59AM +0000, > "Huang, Kai" <kai.huang@intel.com> wrote: > > > On Tue, 2023-02-28 at 03:06 -0800, Isaku Yamahata wrote: > > > > > + if (!e) > > > > > + return -ENOMEM; > > > > > + *e = (struct kvm_cpuid_entry2) { > > > > > + .function = 1, /* Features for X2APIC */ > > > > > + .index = 0, > > > > > + .eax = 0, > > > > > + .ebx = 0, > > > > > + .ecx = 1ULL << 21, /* X2APIC */ > > > > > + .edx = 0, > > > > > + }; > > > > > + vcpu->arch.cpuid_entries = e; > > > > > + vcpu->arch.cpuid_nent = 1; > > > > > > > > As mentioned above, why doing it here? Won't be this be overwritten later in > > > > KVM_SET_CPUID2? > > > > > > Yes, user space VMM can overwrite cpuid[0x1] and APIC base MSR. But it > > > doesn't > > > matter because it's a bug of user space VMM. user space VMM has to keep the > > > consistency of cpuid and MSRs. > > > Because TDX module virtualizes cpuid[0x1].x2apic to fixed 1, KVM value doesn't > > > matter after vcpu creation. > > > Because KVM virtualizes APIC base as read only to guest, cpuid[0x1].x2apic > > > doesn't matter after vcpu creation as long as user space VMM keeps KVM APIC > > > BASE > > > value. > > > > > > > Contrary, can we depend on userspace VMM to set x2APIC in CPUID, but not do this > > in KVM? If userspace doesn't do it, we treat it as userspace's bug. > > > > Plus, userspace anyway needs to set x2APIC in CPUID regardless whether you have > > done above here, correct? > > > > I don't see the point of doing above in KVM because you are neither enforcing > > anything in KVM, nor you are reducing effort of userspace. > > Good idea. I can drop cpuid part from tdx_vcpu_create() and apic base part from > tdx_vcpu_reset(). It needs to modify tdx_has_emulated_msr() to allow user space > VMM to update APIC BASE MSR. My personal preference would be: 1) In KVM_SET_CPUID2, we do sanity check of CPUIDs provided by userspace, and return error if not met (i.e X2APIC isn't advertised). We already have cases that KVM_SET_CPUID2 can fail, so extending to do TDX-specific check seems reasonable to me too. 2) For APIC_BASE, you can just initialize the MSR in tdx_vcpu_reset() and ignore any update (+pr_warn()?) to MSR_IA32_APIC_BASE. But Sean may have different opinion especially for the CPUID part.
On Tue, Feb 28, 2023 at 09:49:10PM +0000, "Huang, Kai" <kai.huang@intel.com> wrote: > On Tue, 2023-02-28 at 12:18 -0800, Isaku Yamahata wrote: > > On Tue, Feb 28, 2023 at 11:52:59AM +0000, > > "Huang, Kai" <kai.huang@intel.com> wrote: > > > > > On Tue, 2023-02-28 at 03:06 -0800, Isaku Yamahata wrote: > > > > > > + if (!e) > > > > > > + return -ENOMEM; > > > > > > + *e = (struct kvm_cpuid_entry2) { > > > > > > + .function = 1, /* Features for X2APIC */ > > > > > > + .index = 0, > > > > > > + .eax = 0, > > > > > > + .ebx = 0, > > > > > > + .ecx = 1ULL << 21, /* X2APIC */ > > > > > > + .edx = 0, > > > > > > + }; > > > > > > + vcpu->arch.cpuid_entries = e; > > > > > > + vcpu->arch.cpuid_nent = 1; > > > > > > > > > > As mentioned above, why doing it here? Won't be this be overwritten later in > > > > > KVM_SET_CPUID2? > > > > > > > > Yes, user space VMM can overwrite cpuid[0x1] and APIC base MSR. But it > > > > doesn't > > > > matter because it's a bug of user space VMM. user space VMM has to keep the > > > > consistency of cpuid and MSRs. > > > > Because TDX module virtualizes cpuid[0x1].x2apic to fixed 1, KVM value doesn't > > > > matter after vcpu creation. > > > > Because KVM virtualizes APIC base as read only to guest, cpuid[0x1].x2apic > > > > doesn't matter after vcpu creation as long as user space VMM keeps KVM APIC > > > > BASE > > > > value. > > > > > > > > > > Contrary, can we depend on userspace VMM to set x2APIC in CPUID, but not do this > > > in KVM? If userspace doesn't do it, we treat it as userspace's bug. > > > > > > Plus, userspace anyway needs to set x2APIC in CPUID regardless whether you have > > > done above here, correct? > > > > > > I don't see the point of doing above in KVM because you are neither enforcing > > > anything in KVM, nor you are reducing effort of userspace. > > > > Good idea. I can drop cpuid part from tdx_vcpu_create() and apic base part from > > tdx_vcpu_reset(). It needs to modify tdx_has_emulated_msr() to allow user space > > VMM to update APIC BASE MSR. > > My personal preference would be: > > 1) In KVM_SET_CPUID2, we do sanity check of CPUIDs provided by userspace, and > return error if not met (i.e X2APIC isn't advertised). We already have cases > that KVM_SET_CPUID2 can fail, so extending to do TDX-specific check seems > reasonable to me too. This is moot. The current check does only check maxphys address bit size and specified xfeatures are supported by host. It's bare minimum for kvm to work. It doesn't try to check consistency. > 2) For APIC_BASE, you can just initialize the MSR in tdx_vcpu_reset() and ignore > any update (+pr_warn()?) to MSR_IA32_APIC_BASE. The x86 common code for KVM_CREATE_VCPU, kvm_arch_vcpu_create(), calls vcpu_create, creates lapic, and calls vcpu_reset(). Setting ACPI BASE MSR with X2APIC enabled, checks if cpuid x2apic bit is set. Please notice guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) in kvm_set_apic_base(). To work around it, one way is set cpuid artificially in create method as this patch does. Other way would be to introduce another version of kvm_set_apic_base() that doesn't check cpuid dedicated for this purpose. The third option is to make it user space responsibility to set initial reset value of APIC BASE MSR. Which option do you prefer? Thanks, > > > But Sean may have different opinion especially for the CPUID part. -- Isaku Yamahata <isaku.yamahata@gmail.com>
On Tue, 2023-02-28 at 16:35 -0800, Isaku Yamahata wrote: > On Tue, Feb 28, 2023 at 09:49:10PM +0000, > "Huang, Kai" <kai.huang@intel.com> wrote: > > > On Tue, 2023-02-28 at 12:18 -0800, Isaku Yamahata wrote: > > > On Tue, Feb 28, 2023 at 11:52:59AM +0000, > > > "Huang, Kai" <kai.huang@intel.com> wrote: > > > > > > > On Tue, 2023-02-28 at 03:06 -0800, Isaku Yamahata wrote: > > > > > > > + if (!e) > > > > > > > + return -ENOMEM; > > > > > > > + *e = (struct kvm_cpuid_entry2) { > > > > > > > + .function = 1, /* Features for X2APIC */ > > > > > > > + .index = 0, > > > > > > > + .eax = 0, > > > > > > > + .ebx = 0, > > > > > > > + .ecx = 1ULL << 21, /* X2APIC */ > > > > > > > + .edx = 0, > > > > > > > + }; > > > > > > > + vcpu->arch.cpuid_entries = e; > > > > > > > + vcpu->arch.cpuid_nent = 1; > > > > > > > > > > > > As mentioned above, why doing it here? Won't be this be overwritten later in > > > > > > KVM_SET_CPUID2? > > > > > > > > > > Yes, user space VMM can overwrite cpuid[0x1] and APIC base MSR. But it > > > > > doesn't > > > > > matter because it's a bug of user space VMM. user space VMM has to keep the > > > > > consistency of cpuid and MSRs. > > > > > Because TDX module virtualizes cpuid[0x1].x2apic to fixed 1, KVM value doesn't > > > > > matter after vcpu creation. > > > > > Because KVM virtualizes APIC base as read only to guest, cpuid[0x1].x2apic > > > > > doesn't matter after vcpu creation as long as user space VMM keeps KVM APIC > > > > > BASE > > > > > value. > > > > > > > > > > > > > Contrary, can we depend on userspace VMM to set x2APIC in CPUID, but not do this > > > > in KVM? If userspace doesn't do it, we treat it as userspace's bug. > > > > > > > > Plus, userspace anyway needs to set x2APIC in CPUID regardless whether you have > > > > done above here, correct? > > > > > > > > I don't see the point of doing above in KVM because you are neither enforcing > > > > anything in KVM, nor you are reducing effort of userspace. > > > > > > Good idea. I can drop cpuid part from tdx_vcpu_create() and apic base part from > > > tdx_vcpu_reset(). It needs to modify tdx_has_emulated_msr() to allow user space > > > VMM to update APIC BASE MSR. > > > > My personal preference would be: > > > > 1) In KVM_SET_CPUID2, we do sanity check of CPUIDs provided by userspace, and > > return error if not met (i.e X2APIC isn't advertised). We already have cases > > that KVM_SET_CPUID2 can fail, so extending to do TDX-specific check seems > > reasonable to me too. > > This is moot. The current check does only check maxphys address bit size and > specified xfeatures are supported by host. It's bare minimum for kvm to work. > It doesn't try to check consistency. > > > > 2) For APIC_BASE, you can just initialize the MSR in tdx_vcpu_reset() and ignore > > any update (+pr_warn()?) to MSR_IA32_APIC_BASE. > > The x86 common code for KVM_CREATE_VCPU, kvm_arch_vcpu_create(), calls vcpu_create, > creates lapic, and calls vcpu_reset(). > > Setting ACPI BASE MSR with X2APIC enabled, checks if cpuid x2apic bit is set. > Please notice guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) in kvm_set_apic_base(). > To work around it, one way is set cpuid artificially in create method as this > patch does. Other way would be to introduce another version of > kvm_set_apic_base() that doesn't check cpuid dedicated for this purpose. > The third option is to make it user space responsibility to set initial reset > value of APIC BASE MSR. > > Which option do you prefer? > I just recall you have already set all CPUIDs via tdx_td_init(). I would do below: 1) keep all CPUIDs in tdx_td_init(), and make vcpu->cpuid point to that. 2) Ignore KVM_SET_CPUID2 for TDX guest (+ pr_warn(), etc). 3) Set TDX-fixed CPU registers/msrs, etc in reset_vcpu().
On Wed, Mar 01, 2023 at 12:49:06AM +0000, "Huang, Kai" <kai.huang@intel.com> wrote: > On Tue, 2023-02-28 at 16:35 -0800, Isaku Yamahata wrote: > > On Tue, Feb 28, 2023 at 09:49:10PM +0000, > > "Huang, Kai" <kai.huang@intel.com> wrote: > > > > > On Tue, 2023-02-28 at 12:18 -0800, Isaku Yamahata wrote: > > > > On Tue, Feb 28, 2023 at 11:52:59AM +0000, > > > > "Huang, Kai" <kai.huang@intel.com> wrote: > > > > > > > > > On Tue, 2023-02-28 at 03:06 -0800, Isaku Yamahata wrote: > > > > > > > > + if (!e) > > > > > > > > + return -ENOMEM; > > > > > > > > + *e = (struct kvm_cpuid_entry2) { > > > > > > > > + .function = 1, /* Features for X2APIC */ > > > > > > > > + .index = 0, > > > > > > > > + .eax = 0, > > > > > > > > + .ebx = 0, > > > > > > > > + .ecx = 1ULL << 21, /* X2APIC */ > > > > > > > > + .edx = 0, > > > > > > > > + }; > > > > > > > > + vcpu->arch.cpuid_entries = e; > > > > > > > > + vcpu->arch.cpuid_nent = 1; > > > > > > > > > > > > > > As mentioned above, why doing it here? Won't be this be overwritten later in > > > > > > > KVM_SET_CPUID2? > > > > > > > > > > > > Yes, user space VMM can overwrite cpuid[0x1] and APIC base MSR. But it > > > > > > doesn't > > > > > > matter because it's a bug of user space VMM. user space VMM has to keep the > > > > > > consistency of cpuid and MSRs. > > > > > > Because TDX module virtualizes cpuid[0x1].x2apic to fixed 1, KVM value doesn't > > > > > > matter after vcpu creation. > > > > > > Because KVM virtualizes APIC base as read only to guest, cpuid[0x1].x2apic > > > > > > doesn't matter after vcpu creation as long as user space VMM keeps KVM APIC > > > > > > BASE > > > > > > value. > > > > > > > > > > > > > > > > Contrary, can we depend on userspace VMM to set x2APIC in CPUID, but not do this > > > > > in KVM? If userspace doesn't do it, we treat it as userspace's bug. > > > > > > > > > > Plus, userspace anyway needs to set x2APIC in CPUID regardless whether you have > > > > > done above here, correct? > > > > > > > > > > I don't see the point of doing above in KVM because you are neither enforcing > > > > > anything in KVM, nor you are reducing effort of userspace. > > > > > > > > Good idea. I can drop cpuid part from tdx_vcpu_create() and apic base part from > > > > tdx_vcpu_reset(). It needs to modify tdx_has_emulated_msr() to allow user space > > > > VMM to update APIC BASE MSR. > > > > > > My personal preference would be: > > > > > > 1) In KVM_SET_CPUID2, we do sanity check of CPUIDs provided by userspace, and > > > return error if not met (i.e X2APIC isn't advertised). We already have cases > > > that KVM_SET_CPUID2 can fail, so extending to do TDX-specific check seems > > > reasonable to me too. > > > > This is moot. The current check does only check maxphys address bit size and > > specified xfeatures are supported by host. It's bare minimum for kvm to work. > > It doesn't try to check consistency. > > > > > > > 2) For APIC_BASE, you can just initialize the MSR in tdx_vcpu_reset() and ignore > > > any update (+pr_warn()?) to MSR_IA32_APIC_BASE. > > > > The x86 common code for KVM_CREATE_VCPU, kvm_arch_vcpu_create(), calls vcpu_create, > > creates lapic, and calls vcpu_reset(). > > > > Setting ACPI BASE MSR with X2APIC enabled, checks if cpuid x2apic bit is set. > > Please notice guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) in kvm_set_apic_base(). > > To work around it, one way is set cpuid artificially in create method as this > > patch does. Other way would be to introduce another version of > > kvm_set_apic_base() that doesn't check cpuid dedicated for this purpose. > > The third option is to make it user space responsibility to set initial reset > > value of APIC BASE MSR. > > > > Which option do you prefer? > > > > I just recall you have already set all CPUIDs via tdx_td_init(). I would do > below: > > 1) keep all CPUIDs in tdx_td_init(), and make vcpu->cpuid point to that. > 2) Ignore KVM_SET_CPUID2 for TDX guest (+ pr_warn(), etc). > 3) Set TDX-fixed CPU registers/msrs, etc in reset_vcpu(). Finally I come up with the following flow. - KVM_CREATE_VCPU - tdx_vcpu_create() no cpuid, no msr operation - KVM_SET_CPUID2 user space has to set cpuid...x2apic=1 - KVM_TDX_INIT_VCPU tdx_vcpu_ioctl() sets APIC_BASE MSR to x2apic enabled. Here if user space VMM doesn't set cpuid properly, it results in error. ACPI_BASE MSR is read only for both user space VMM and guest TD. cpuid: After KVM_TDX_INIT_VCPU, user space VMM can update it by KVM_SET_CPUID2. KVM doesn't care. Guest TD see cpuid.x2apic value virtualized by TDX module while KVM internally doesn't use the value because APIC_BASE won't change. Thanks, -- Isaku Yamahata <isaku.yamahata@gmail.com>
On Thu, 12 Jan 2023 08:31:31 -0800 isaku.yamahata@intel.com wrote: > From: Isaku Yamahata <isaku.yamahata@intel.com> > > The next step of TDX guest creation is to create vcpu. Allocate TDX vcpu > structures, partially initialize it. Allocate pages of TDX vcpu for the > TDX module. Actual donation TDX vcpu pages to the TDX module is not done > yet. > > In the case of the conventional case, cpuid is empty at the initialization. > and cpuid is configured after the vcpu initialization. Because TDX > supports only X2APIC mode, cpuid is forcibly initialized to support X2APIC > on the vcpu initialization. > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > --- > Changes v10 -> v11: > - NULL check of kvmalloc_array() in tdx_vcpu_reset. Move it to > tdx_vcpu_create() > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > --- > arch/x86/kvm/vmx/main.c | 40 ++++++++++++++++++-- > arch/x86/kvm/vmx/tdx.c | 75 ++++++++++++++++++++++++++++++++++++++ > arch/x86/kvm/vmx/x86_ops.h | 10 +++++ > arch/x86/kvm/x86.c | 2 + > 4 files changed, 123 insertions(+), 4 deletions(-) > > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c > index ddf0742f1f67..59813ca05f36 100644 > --- a/arch/x86/kvm/vmx/main.c > +++ b/arch/x86/kvm/vmx/main.c > @@ -63,6 +63,38 @@ static void vt_vm_free(struct kvm *kvm) > tdx_vm_free(kvm); > } > > +static int vt_vcpu_precreate(struct kvm *kvm) > +{ > + if (is_td(kvm)) > + return 0; > + > + return vmx_vcpu_precreate(kvm); > +} > + > +static int vt_vcpu_create(struct kvm_vcpu *vcpu) > +{ > + if (is_td_vcpu(vcpu)) > + return tdx_vcpu_create(vcpu); > + > + return vmx_vcpu_create(vcpu); > +} > + ----- > +static void vt_vcpu_free(struct kvm_vcpu *vcpu) > +{ > + if (is_td_vcpu(vcpu)) > + return tdx_vcpu_free(vcpu); > + > + return vmx_vcpu_free(vcpu); > +} > + > +static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > +{ > + if (is_td_vcpu(vcpu)) > + return tdx_vcpu_reset(vcpu, init_event); > + > + return vmx_vcpu_reset(vcpu, init_event); > +} > + ---- It seems a little strange to use return in this style. Would it be better like: ----- if (xxx) { tdx_vcpu_reset(xxx); return; } vmx_vcpu_reset(xxx); ---- ? > static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) > { > if (!is_td(kvm)) > @@ -90,10 +122,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = { > .vm_destroy = vt_vm_destroy, > .vm_free = vt_vm_free, > > - .vcpu_precreate = vmx_vcpu_precreate, > - .vcpu_create = vmx_vcpu_create, > - .vcpu_free = vmx_vcpu_free, > - .vcpu_reset = vmx_vcpu_reset, > + .vcpu_precreate = vt_vcpu_precreate, > + .vcpu_create = vt_vcpu_create, > + .vcpu_free = vt_vcpu_free, > + .vcpu_reset = vt_vcpu_reset, > > .prepare_switch_to_guest = vmx_prepare_switch_to_guest, > .vcpu_load = vmx_vcpu_load, > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > index 557a609c5147..099f0737a5aa 100644 > --- a/arch/x86/kvm/vmx/tdx.c > +++ b/arch/x86/kvm/vmx/tdx.c > @@ -281,6 +281,81 @@ int tdx_vm_init(struct kvm *kvm) > return 0; > } > > +int tdx_vcpu_create(struct kvm_vcpu *vcpu) > +{ > + struct kvm_cpuid_entry2 *e; > + > + /* > + * On cpu creation, cpuid entry is blank. Forcibly enable > + * X2APIC feature to allow X2APIC. > + * Because vcpu_reset() can't return error, allocation is done here. > + */ > + WARN_ON_ONCE(vcpu->arch.cpuid_entries); > + WARN_ON_ONCE(vcpu->arch.cpuid_nent); > + e = kvmalloc_array(1, sizeof(*e), GFP_KERNEL_ACCOUNT); > + if (!e) > + return -ENOMEM; > + *e = (struct kvm_cpuid_entry2) { > + .function = 1, /* Features for X2APIC */ > + .index = 0, > + .eax = 0, > + .ebx = 0, > + .ecx = 1ULL << 21, /* X2APIC */ > + .edx = 0, > + }; > + vcpu->arch.cpuid_entries = e; > + vcpu->arch.cpuid_nent = 1; > + > + /* TDX only supports x2APIC, which requires an in-kernel local APIC. */ > + if (!vcpu->arch.apic) > + return -EINVAL; > + > + fpstate_set_confidential(&vcpu->arch.guest_fpu); > + > + vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; > + > + vcpu->arch.cr0_guest_owned_bits = -1ul; > + vcpu->arch.cr4_guest_owned_bits = -1ul; > + > + vcpu->arch.tsc_offset = to_kvm_tdx(vcpu->kvm)->tsc_offset; > + vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; > + vcpu->arch.guest_state_protected = > + !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTRIBUTE_DEBUG); > + > + return 0; > +} > + > +void tdx_vcpu_free(struct kvm_vcpu *vcpu) > +{ > + /* This is stub for now. More logic will come. */ > +} > + > +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > +{ > + struct msr_data apic_base_msr; > + > + /* TDX doesn't support INIT event. */ > + if (WARN_ON_ONCE(init_event)) > + goto td_bugged; > + > + /* TDX rquires X2APIC. */ ^ requires > + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC; > + if (kvm_vcpu_is_reset_bsp(vcpu)) > + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; > + apic_base_msr.host_initiated = true; > + if (WARN_ON_ONCE(kvm_set_apic_base(vcpu, &apic_base_msr))) > + goto td_bugged; > + > + /* > + * Don't update mp_state to runnable because more initialization > + * is needed by TDX_VCPU_INIT. > + */ > + return; > + > +td_bugged: > + vcpu->kvm->vm_bugged = true; > +} > + 1) Using vm_bugged to terminate the VM creation feels off. When using it in creation path, the termination still happens in xx_vcpu_run(). Thus, even something wrong happens at a certain point of the creation path, the VM creation still continues. Until the xxx_vcpu_run(), the VM termination finally happens. Why not just fail in the creation path? 2) Move > + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC; > + if (kvm_vcpu_is_reset_bsp(vcpu)) > + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; > + apic_base_msr.host_initiated = true; to: void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) { struct kvm_lapic *apic = vcpu->arch.apic; u64 msr_val; int i; if (!init_event) { msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; /* here */ if (is_td_vcpu(vcpu)) msr_val = xxxx; if (kvm_vcpu_is_reset_bsp(vcpu)) msr_val |= MSR_IA32_APICBASE_BSP; kvm_lapic_set_base(vcpu, msr_val); } PS: Is there any reason that APIC MSR in TDX doesn't need MSR_IA32_APICBASE_ENABLE? 3) Change the following: > + > + /* TDX doesn't support INIT event. */ > + if (WARN_ON_ONCE(init_event)) > + goto td_bugged; > + to WARN_ON_ONCE(init_event); kvm_cpu_deliver_init() will trigger a kvm_vcpu_reset(xxx, init_event=true), but you have already avoided this in vt_vcpu_deliver_init(). A warn is good enough to remind people. With these changes, tdx_vcpu_reset() will only contain the CPUID configuration , using the vm_bugged to terminate the VM in tdx_vcpu_reset() can be removed. > int tdx_dev_ioctl(void __user *argp) > { > struct kvm_tdx_capabilities __user *user_caps; > diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h > index 6c40dda1cc2f..37ab2cfd35bc 100644 > --- a/arch/x86/kvm/vmx/x86_ops.h > +++ b/arch/x86/kvm/vmx/x86_ops.h > @@ -147,7 +147,12 @@ int tdx_offline_cpu(void); > int tdx_vm_init(struct kvm *kvm); > void tdx_mmu_release_hkid(struct kvm *kvm); > void tdx_vm_free(struct kvm *kvm); > + > int tdx_vm_ioctl(struct kvm *kvm, void __user *argp); > + > +int tdx_vcpu_create(struct kvm_vcpu *vcpu); > +void tdx_vcpu_free(struct kvm_vcpu *vcpu); > +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); > #else > static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return 0; } > static inline void tdx_hardware_unsetup(void) {} > @@ -159,7 +164,12 @@ static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; } > static inline void tdx_mmu_release_hkid(struct kvm *kvm) {} > static inline void tdx_flush_shadow_all_private(struct kvm *kvm) {} > static inline void tdx_vm_free(struct kvm *kvm) {} > + > static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; } > + > +static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } > +static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {} > +static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {} > #endif > > #endif /* __KVM_X86_VMX_X86_OPS_H */ > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > index 1fb135e0c98f..e8bc66031a1d 100644 > --- a/arch/x86/kvm/x86.c > +++ b/arch/x86/kvm/x86.c > @@ -492,6 +492,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > kvm_recalculate_apic_map(vcpu->kvm); > return 0; > } > +EXPORT_SYMBOL_GPL(kvm_set_apic_base); > > /* > * Handle a fault on a hardware virtualization (VMX or SVM) instruction. > @@ -12109,6 +12110,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) > { > return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; > } > +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); > The symbols don't need to be exported with the changes mentioned above. > bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) > {
On Mon, Jan 16, 2023 at 12:46:06PM +0200, Zhi Wang <zhi.wang.linux@gmail.com> wrote: > On Thu, 12 Jan 2023 08:31:31 -0800 > isaku.yamahata@intel.com wrote: > > > From: Isaku Yamahata <isaku.yamahata@intel.com> > > > > The next step of TDX guest creation is to create vcpu. Allocate TDX vcpu > > structures, partially initialize it. Allocate pages of TDX vcpu for the > > TDX module. Actual donation TDX vcpu pages to the TDX module is not done > > yet. > > > > In the case of the conventional case, cpuid is empty at the initialization. > > and cpuid is configured after the vcpu initialization. Because TDX > > supports only X2APIC mode, cpuid is forcibly initialized to support X2APIC > > on the vcpu initialization. > > > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > > --- > > Changes v10 -> v11: > > - NULL check of kvmalloc_array() in tdx_vcpu_reset. Move it to > > tdx_vcpu_create() > > > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > > --- > > arch/x86/kvm/vmx/main.c | 40 ++++++++++++++++++-- > > arch/x86/kvm/vmx/tdx.c | 75 ++++++++++++++++++++++++++++++++++++++ > > arch/x86/kvm/vmx/x86_ops.h | 10 +++++ > > arch/x86/kvm/x86.c | 2 + > > 4 files changed, 123 insertions(+), 4 deletions(-) > > > > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c > > index ddf0742f1f67..59813ca05f36 100644 > > --- a/arch/x86/kvm/vmx/main.c > > +++ b/arch/x86/kvm/vmx/main.c > > @@ -63,6 +63,38 @@ static void vt_vm_free(struct kvm *kvm) > > tdx_vm_free(kvm); > > } > > > > +static int vt_vcpu_precreate(struct kvm *kvm) > > +{ > > + if (is_td(kvm)) > > + return 0; > > + > > + return vmx_vcpu_precreate(kvm); > > +} > > + > > +static int vt_vcpu_create(struct kvm_vcpu *vcpu) > > +{ > > + if (is_td_vcpu(vcpu)) > > + return tdx_vcpu_create(vcpu); > > + > > + return vmx_vcpu_create(vcpu); > > +} > > + > > ----- > > +static void vt_vcpu_free(struct kvm_vcpu *vcpu) > > +{ > > + if (is_td_vcpu(vcpu)) > > + return tdx_vcpu_free(vcpu); > > + > > + return vmx_vcpu_free(vcpu); > > +} > > + > > +static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > +{ > > + if (is_td_vcpu(vcpu)) > > + return tdx_vcpu_reset(vcpu, init_event); > > + > > + return vmx_vcpu_reset(vcpu, init_event); > > +} > > + > ---- > > It seems a little strange to use return in this style. Would it be better like: > > ----- > if (xxx) { > tdx_vcpu_reset(xxx); > return; > } > > vmx_vcpu_reset(xxx); > ---- > > ? It's C11. I updated the code to not use the feature. > > static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) > > { > > if (!is_td(kvm)) > > @@ -90,10 +122,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = { > > .vm_destroy = vt_vm_destroy, > > .vm_free = vt_vm_free, > > > > - .vcpu_precreate = vmx_vcpu_precreate, > > - .vcpu_create = vmx_vcpu_create, > > - .vcpu_free = vmx_vcpu_free, > > - .vcpu_reset = vmx_vcpu_reset, > > + .vcpu_precreate = vt_vcpu_precreate, > > + .vcpu_create = vt_vcpu_create, > > + .vcpu_free = vt_vcpu_free, > > + .vcpu_reset = vt_vcpu_reset, > > > > .prepare_switch_to_guest = vmx_prepare_switch_to_guest, > > .vcpu_load = vmx_vcpu_load, > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > > index 557a609c5147..099f0737a5aa 100644 > > --- a/arch/x86/kvm/vmx/tdx.c > > +++ b/arch/x86/kvm/vmx/tdx.c > > @@ -281,6 +281,81 @@ int tdx_vm_init(struct kvm *kvm) > > return 0; > > } > > > > +int tdx_vcpu_create(struct kvm_vcpu *vcpu) > > +{ > > + struct kvm_cpuid_entry2 *e; > > + > > + /* > > + * On cpu creation, cpuid entry is blank. Forcibly enable > > + * X2APIC feature to allow X2APIC. > > + * Because vcpu_reset() can't return error, allocation is done here. > > + */ > > + WARN_ON_ONCE(vcpu->arch.cpuid_entries); > > + WARN_ON_ONCE(vcpu->arch.cpuid_nent); > > + e = kvmalloc_array(1, sizeof(*e), GFP_KERNEL_ACCOUNT); > > + if (!e) > > + return -ENOMEM; > > + *e = (struct kvm_cpuid_entry2) { > > + .function = 1, /* Features for X2APIC */ > > + .index = 0, > > + .eax = 0, > > + .ebx = 0, > > + .ecx = 1ULL << 21, /* X2APIC */ > > + .edx = 0, > > + }; > > + vcpu->arch.cpuid_entries = e; > > + vcpu->arch.cpuid_nent = 1; > > + > > + /* TDX only supports x2APIC, which requires an in-kernel local APIC. */ > > + if (!vcpu->arch.apic) > > + return -EINVAL; > > + > > + fpstate_set_confidential(&vcpu->arch.guest_fpu); > > + > > + vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; > > + > > + vcpu->arch.cr0_guest_owned_bits = -1ul; > > + vcpu->arch.cr4_guest_owned_bits = -1ul; > > + > > + vcpu->arch.tsc_offset = to_kvm_tdx(vcpu->kvm)->tsc_offset; > > + vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; > > + vcpu->arch.guest_state_protected = > > + !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTRIBUTE_DEBUG); > > + > > + return 0; > > +} > > + > > +void tdx_vcpu_free(struct kvm_vcpu *vcpu) > > +{ > > + /* This is stub for now. More logic will come. */ > > +} > > + > > +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > +{ > > + struct msr_data apic_base_msr; > > + > > + /* TDX doesn't support INIT event. */ > > + if (WARN_ON_ONCE(init_event)) > > + goto td_bugged; > > + > > + /* TDX rquires X2APIC. */ > ^ > requires > > + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC; > > + if (kvm_vcpu_is_reset_bsp(vcpu)) > > + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; > > + apic_base_msr.host_initiated = true; > > + if (WARN_ON_ONCE(kvm_set_apic_base(vcpu, &apic_base_msr))) > > + goto td_bugged; > > + > > + /* > > + * Don't update mp_state to runnable because more initialization > > + * is needed by TDX_VCPU_INIT. > > + */ > > + return; > > + > > +td_bugged: > > + vcpu->kvm->vm_bugged = true; > > +} > > + > > 1) Using vm_bugged to terminate the VM creation feels off. When > using it in creation path, the termination still happens in xx_vcpu_run(). > > Thus, even something wrong happens at a certain point of the creation path, > the VM creation still continues. Until the xxx_vcpu_run(), the VM termination > finally happens. > > Why not just fail in the creation path? I converted vm_bugged to KVM_BUG_ON. Because the td_bugged case shouldn't happen for TDX case, it's worthwhile for KVM_BUG_ON() > 2) Move > > > + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC; > > + if (kvm_vcpu_is_reset_bsp(vcpu)) > > + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; > > + apic_base_msr.host_initiated = true; > > to: > > void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) > { > struct kvm_lapic *apic = vcpu->arch.apic; > u64 msr_val; > int i; > > if (!init_event) { > msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; > > /* here */ > if (is_td_vcpu(vcpu)) > msr_val = xxxx; > if (kvm_vcpu_is_reset_bsp(vcpu)) > msr_val |= MSR_IA32_APICBASE_BSP; > kvm_lapic_set_base(vcpu, msr_val); > } No. Because I'm trying to contain is_td/is_td_vcpu in vmx specific and not use in common x86 code. > PS: Is there any reason that APIC MSR in TDX doesn't need > MSR_IA32_APICBASE_ENABLE? because LAPIC_MODE_X2APIC includes MSR_IA32_APICBASE_ENABLE. In lapic.h LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE, > 3) Change the following: > > > + > > + /* TDX doesn't support INIT event. */ > > + if (WARN_ON_ONCE(init_event)) > > + goto td_bugged; > > + > > to > WARN_ON_ONCE(init_event); > > kvm_cpu_deliver_init() will trigger a kvm_vcpu_reset(xxx, init_event=true), > but you have already avoided this in vt_vcpu_deliver_init(). A warn > is good enough to remind people. I converted it into KVM_BUG_ON(). > With these changes, tdx_vcpu_reset() will only contain the CPUID configuration > , using the vm_bugged to terminate the VM in tdx_vcpu_reset() can be removed. > > > int tdx_dev_ioctl(void __user *argp) > > { > > struct kvm_tdx_capabilities __user *user_caps; > > diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h > > index 6c40dda1cc2f..37ab2cfd35bc 100644 > > --- a/arch/x86/kvm/vmx/x86_ops.h > > +++ b/arch/x86/kvm/vmx/x86_ops.h > > @@ -147,7 +147,12 @@ int tdx_offline_cpu(void); > > int tdx_vm_init(struct kvm *kvm); > > void tdx_mmu_release_hkid(struct kvm *kvm); > > void tdx_vm_free(struct kvm *kvm); > > + > > int tdx_vm_ioctl(struct kvm *kvm, void __user *argp); > > + > > +int tdx_vcpu_create(struct kvm_vcpu *vcpu); > > +void tdx_vcpu_free(struct kvm_vcpu *vcpu); > > +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); > > #else > > static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return 0; } > > static inline void tdx_hardware_unsetup(void) {} > > @@ -159,7 +164,12 @@ static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; } > > static inline void tdx_mmu_release_hkid(struct kvm *kvm) {} > > static inline void tdx_flush_shadow_all_private(struct kvm *kvm) {} > > static inline void tdx_vm_free(struct kvm *kvm) {} > > + > > static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; } > > + > > +static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } > > +static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {} > > +static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {} > > #endif > > > > #endif /* __KVM_X86_VMX_X86_OPS_H */ > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > index 1fb135e0c98f..e8bc66031a1d 100644 > > --- a/arch/x86/kvm/x86.c > > +++ b/arch/x86/kvm/x86.c > > @@ -492,6 +492,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > > kvm_recalculate_apic_map(vcpu->kvm); > > return 0; > > } > > +EXPORT_SYMBOL_GPL(kvm_set_apic_base); > > > > /* > > * Handle a fault on a hardware virtualization (VMX or SVM) instruction. > > @@ -12109,6 +12110,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) > > { > > return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; > > } > > +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); > > > > The symbols don't need to be exported with the changes mentioned above. > > > bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) > > { > -- Isaku Yamahata <isaku.yamahata@gmail.com>
On Mon, 27 Feb 2023 15:49:14 -0800 Isaku Yamahata <isaku.yamahata@gmail.com> wrote: > On Mon, Jan 16, 2023 at 12:46:06PM +0200, > Zhi Wang <zhi.wang.linux@gmail.com> wrote: > > > On Thu, 12 Jan 2023 08:31:31 -0800 > > isaku.yamahata@intel.com wrote: > > > > > From: Isaku Yamahata <isaku.yamahata@intel.com> > > > > > > The next step of TDX guest creation is to create vcpu. Allocate TDX vcpu > > > structures, partially initialize it. Allocate pages of TDX vcpu for the > > > TDX module. Actual donation TDX vcpu pages to the TDX module is not done > > > yet. > > > > > > In the case of the conventional case, cpuid is empty at the initialization. > > > and cpuid is configured after the vcpu initialization. Because TDX > > > supports only X2APIC mode, cpuid is forcibly initialized to support X2APIC > > > on the vcpu initialization. > > > > > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > > > --- > > > Changes v10 -> v11: > > > - NULL check of kvmalloc_array() in tdx_vcpu_reset. Move it to > > > tdx_vcpu_create() > > > > > > Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com> > > > --- > > > arch/x86/kvm/vmx/main.c | 40 ++++++++++++++++++-- > > > arch/x86/kvm/vmx/tdx.c | 75 ++++++++++++++++++++++++++++++++++++++ > > > arch/x86/kvm/vmx/x86_ops.h | 10 +++++ > > > arch/x86/kvm/x86.c | 2 + > > > 4 files changed, 123 insertions(+), 4 deletions(-) > > > > > > diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c > > > index ddf0742f1f67..59813ca05f36 100644 > > > --- a/arch/x86/kvm/vmx/main.c > > > +++ b/arch/x86/kvm/vmx/main.c > > > @@ -63,6 +63,38 @@ static void vt_vm_free(struct kvm *kvm) > > > tdx_vm_free(kvm); > > > } > > > > > > +static int vt_vcpu_precreate(struct kvm *kvm) > > > +{ > > > + if (is_td(kvm)) > > > + return 0; > > > + > > > + return vmx_vcpu_precreate(kvm); > > > +} > > > + > > > +static int vt_vcpu_create(struct kvm_vcpu *vcpu) > > > +{ > > > + if (is_td_vcpu(vcpu)) > > > + return tdx_vcpu_create(vcpu); > > > + > > > + return vmx_vcpu_create(vcpu); > > > +} > > > + > > > > ----- > > > +static void vt_vcpu_free(struct kvm_vcpu *vcpu) > > > +{ > > > + if (is_td_vcpu(vcpu)) > > > + return tdx_vcpu_free(vcpu); > > > + > > > + return vmx_vcpu_free(vcpu); > > > +} > > > + > > > +static void vt_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > > +{ > > > + if (is_td_vcpu(vcpu)) > > > + return tdx_vcpu_reset(vcpu, init_event); > > > + > > > + return vmx_vcpu_reset(vcpu, init_event); > > > +} > > > + > > ---- > > > > It seems a little strange to use return in this style. Would it be better like: > > > > ----- > > if (xxx) { > > tdx_vcpu_reset(xxx); > > return; > > } > > > > vmx_vcpu_reset(xxx); > > ---- > > > > ? > > It's C11. I updated the code to not use the feature. > > > > > static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp) > > > { > > > if (!is_td(kvm)) > > > @@ -90,10 +122,10 @@ struct kvm_x86_ops vt_x86_ops __initdata = { > > > .vm_destroy = vt_vm_destroy, > > > .vm_free = vt_vm_free, > > > > > > - .vcpu_precreate = vmx_vcpu_precreate, > > > - .vcpu_create = vmx_vcpu_create, > > > - .vcpu_free = vmx_vcpu_free, > > > - .vcpu_reset = vmx_vcpu_reset, > > > + .vcpu_precreate = vt_vcpu_precreate, > > > + .vcpu_create = vt_vcpu_create, > > > + .vcpu_free = vt_vcpu_free, > > > + .vcpu_reset = vt_vcpu_reset, > > > > > > .prepare_switch_to_guest = vmx_prepare_switch_to_guest, > > > .vcpu_load = vmx_vcpu_load, > > > diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c > > > index 557a609c5147..099f0737a5aa 100644 > > > --- a/arch/x86/kvm/vmx/tdx.c > > > +++ b/arch/x86/kvm/vmx/tdx.c > > > @@ -281,6 +281,81 @@ int tdx_vm_init(struct kvm *kvm) > > > return 0; > > > } > > > > > > +int tdx_vcpu_create(struct kvm_vcpu *vcpu) > > > +{ > > > + struct kvm_cpuid_entry2 *e; > > > + > > > + /* > > > + * On cpu creation, cpuid entry is blank. Forcibly enable > > > + * X2APIC feature to allow X2APIC. > > > + * Because vcpu_reset() can't return error, allocation is done here. > > > + */ > > > + WARN_ON_ONCE(vcpu->arch.cpuid_entries); > > > + WARN_ON_ONCE(vcpu->arch.cpuid_nent); > > > + e = kvmalloc_array(1, sizeof(*e), GFP_KERNEL_ACCOUNT); > > > + if (!e) > > > + return -ENOMEM; > > > + *e = (struct kvm_cpuid_entry2) { > > > + .function = 1, /* Features for X2APIC */ > > > + .index = 0, > > > + .eax = 0, > > > + .ebx = 0, > > > + .ecx = 1ULL << 21, /* X2APIC */ > > > + .edx = 0, > > > + }; > > > + vcpu->arch.cpuid_entries = e; > > > + vcpu->arch.cpuid_nent = 1; > > > + > > > + /* TDX only supports x2APIC, which requires an in-kernel local APIC. */ > > > + if (!vcpu->arch.apic) > > > + return -EINVAL; > > > + > > > + fpstate_set_confidential(&vcpu->arch.guest_fpu); > > > + > > > + vcpu->arch.efer = EFER_SCE | EFER_LME | EFER_LMA | EFER_NX; > > > + > > > + vcpu->arch.cr0_guest_owned_bits = -1ul; > > > + vcpu->arch.cr4_guest_owned_bits = -1ul; > > > + > > > + vcpu->arch.tsc_offset = to_kvm_tdx(vcpu->kvm)->tsc_offset; > > > + vcpu->arch.l1_tsc_offset = vcpu->arch.tsc_offset; > > > + vcpu->arch.guest_state_protected = > > > + !(to_kvm_tdx(vcpu->kvm)->attributes & TDX_TD_ATTRIBUTE_DEBUG); > > > + > > > + return 0; > > > +} > > > + > > > +void tdx_vcpu_free(struct kvm_vcpu *vcpu) > > > +{ > > > + /* This is stub for now. More logic will come. */ > > > +} > > > + > > > +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) > > > +{ > > > + struct msr_data apic_base_msr; > > > + > > > + /* TDX doesn't support INIT event. */ > > > + if (WARN_ON_ONCE(init_event)) > > > + goto td_bugged; > > > + > > > + /* TDX rquires X2APIC. */ > > ^ > > requires > > > + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC; > > > + if (kvm_vcpu_is_reset_bsp(vcpu)) > > > + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; > > > + apic_base_msr.host_initiated = true; > > > + if (WARN_ON_ONCE(kvm_set_apic_base(vcpu, &apic_base_msr))) > > > + goto td_bugged; > > > + > > > + /* > > > + * Don't update mp_state to runnable because more initialization > > > + * is needed by TDX_VCPU_INIT. > > > + */ > > > + return; > > > + > > > +td_bugged: > > > + vcpu->kvm->vm_bugged = true; > > > +} > > > + > > > > 1) Using vm_bugged to terminate the VM creation feels off. When > > using it in creation path, the termination still happens in xx_vcpu_run(). > > > > Thus, even something wrong happens at a certain point of the creation path, > > the VM creation still continues. Until the xxx_vcpu_run(), the VM termination > > finally happens. > > > > Why not just fail in the creation path? > > I converted vm_bugged to KVM_BUG_ON. Because the td_bugged case shouldn't > happen for TDX case, it's worthwhile for KVM_BUG_ON() > > > > 2) Move > > > > > + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC; > > > + if (kvm_vcpu_is_reset_bsp(vcpu)) > > > + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; > > > + apic_base_msr.host_initiated = true; > > > > to: > > > > void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) > > { > > struct kvm_lapic *apic = vcpu->arch.apic; > > u64 msr_val; > > int i; > > > > if (!init_event) { > > msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; > > > > /* here */ > > if (is_td_vcpu(vcpu)) > > msr_val = xxxx; > > if (kvm_vcpu_is_reset_bsp(vcpu)) > > msr_val |= MSR_IA32_APICBASE_BSP; > > kvm_lapic_set_base(vcpu, msr_val); > > } > > No. Because I'm trying to contain is_td/is_td_vcpu in vmx specific and not use > in common x86 code. > I guess so. Centeralizing the initialization would be the nice and greatly improve the readablity of the code. Maybe adding a new callback in kvm x86_ops like .get_default_msr_val instead. > > > PS: Is there any reason that APIC MSR in TDX doesn't need > > MSR_IA32_APICBASE_ENABLE? > > because LAPIC_MODE_X2APIC includes MSR_IA32_APICBASE_ENABLE. > In lapic.h > LAPIC_MODE_X2APIC = MSR_IA32_APICBASE_ENABLE | X2APIC_ENABLE, > > > > > 3) Change the following: > > > > > + > > > + /* TDX doesn't support INIT event. */ > > > + if (WARN_ON_ONCE(init_event)) > > > + goto td_bugged; > > > + > > > > to > > WARN_ON_ONCE(init_event); > > > > kvm_cpu_deliver_init() will trigger a kvm_vcpu_reset(xxx, init_event=true), > > but you have already avoided this in vt_vcpu_deliver_init(). A warn > > is good enough to remind people. > > I converted it into KVM_BUG_ON(). > > > > With these changes, tdx_vcpu_reset() will only contain the CPUID configuration > > , using the vm_bugged to terminate the VM in tdx_vcpu_reset() can be removed. > > > > > int tdx_dev_ioctl(void __user *argp) > > > { > > > struct kvm_tdx_capabilities __user *user_caps; > > > diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h > > > index 6c40dda1cc2f..37ab2cfd35bc 100644 > > > --- a/arch/x86/kvm/vmx/x86_ops.h > > > +++ b/arch/x86/kvm/vmx/x86_ops.h > > > @@ -147,7 +147,12 @@ int tdx_offline_cpu(void); > > > int tdx_vm_init(struct kvm *kvm); > > > void tdx_mmu_release_hkid(struct kvm *kvm); > > > void tdx_vm_free(struct kvm *kvm); > > > + > > > int tdx_vm_ioctl(struct kvm *kvm, void __user *argp); > > > + > > > +int tdx_vcpu_create(struct kvm_vcpu *vcpu); > > > +void tdx_vcpu_free(struct kvm_vcpu *vcpu); > > > +void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event); > > > #else > > > static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return 0; } > > > static inline void tdx_hardware_unsetup(void) {} > > > @@ -159,7 +164,12 @@ static inline int tdx_vm_init(struct kvm *kvm) { return -EOPNOTSUPP; } > > > static inline void tdx_mmu_release_hkid(struct kvm *kvm) {} > > > static inline void tdx_flush_shadow_all_private(struct kvm *kvm) {} > > > static inline void tdx_vm_free(struct kvm *kvm) {} > > > + > > > static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOPNOTSUPP; } > > > + > > > +static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; } > > > +static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {} > > > +static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {} > > > #endif > > > > > > #endif /* __KVM_X86_VMX_X86_OPS_H */ > > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c > > > index 1fb135e0c98f..e8bc66031a1d 100644 > > > --- a/arch/x86/kvm/x86.c > > > +++ b/arch/x86/kvm/x86.c > > > @@ -492,6 +492,7 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) > > > kvm_recalculate_apic_map(vcpu->kvm); > > > return 0; > > > } > > > +EXPORT_SYMBOL_GPL(kvm_set_apic_base); > > > > > > /* > > > * Handle a fault on a hardware virtualization (VMX or SVM) instruction. > > > @@ -12109,6 +12110,7 @@ bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) > > > { > > > return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; > > > } > > > +EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp); > > > > > > > The symbols don't need to be exported with the changes mentioned above. > > > > > bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) > > > { > > >
On Tue, Feb 28, 2023 at 07:55:09PM +0200, Zhi Wang <zhi.wang.linux@gmail.com> wrote: > On Mon, 27 Feb 2023 15:49:14 -0800 > Isaku Yamahata <isaku.yamahata@gmail.com> wrote: > > > 2) Move > > > > > > > + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC; > > > > + if (kvm_vcpu_is_reset_bsp(vcpu)) > > > > + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; > > > > + apic_base_msr.host_initiated = true; > > > > > > to: > > > > > > void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) > > > { > > > struct kvm_lapic *apic = vcpu->arch.apic; > > > u64 msr_val; > > > int i; > > > > > > if (!init_event) { > > > msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; > > > > > > /* here */ > > > if (is_td_vcpu(vcpu)) > > > msr_val = xxxx; > > > if (kvm_vcpu_is_reset_bsp(vcpu)) > > > msr_val |= MSR_IA32_APICBASE_BSP; > > > kvm_lapic_set_base(vcpu, msr_val); > > > } > > > > No. Because I'm trying to contain is_td/is_td_vcpu in vmx specific and not use > > in common x86 code. > > > > I guess so. Centeralizing the initialization would be the nice and greatly > improve the readablity of the code. Maybe adding a new callback in kvm x86_ops > like .get_default_msr_val instead. Finally I can eliminate cpuid/APIC BASE MSR, and move it user space VMM, qemu. -- Isaku Yamahata <isaku.yamahata@gmail.com>
On Tue, 28 Feb 2023 12:20:31 -0800 Isaku Yamahata <isaku.yamahata@gmail.com> wrote: > On Tue, Feb 28, 2023 at 07:55:09PM +0200, > Zhi Wang <zhi.wang.linux@gmail.com> wrote: > > > On Mon, 27 Feb 2023 15:49:14 -0800 > > Isaku Yamahata <isaku.yamahata@gmail.com> wrote: > > > > > 2) Move > > > > > > > > > + apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC; > > > > > + if (kvm_vcpu_is_reset_bsp(vcpu)) > > > > > + apic_base_msr.data |= MSR_IA32_APICBASE_BSP; > > > > > + apic_base_msr.host_initiated = true; > > > > > > > > to: > > > > > > > > void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) > > > > { > > > > struct kvm_lapic *apic = vcpu->arch.apic; > > > > u64 msr_val; > > > > int i; > > > > > > > > if (!init_event) { > > > > msr_val = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE; > > > > > > > > /* here */ > > > > if (is_td_vcpu(vcpu)) > > > > msr_val = xxxx; > > > > if (kvm_vcpu_is_reset_bsp(vcpu)) > > > > msr_val |= MSR_IA32_APICBASE_BSP; > > > > kvm_lapic_set_base(vcpu, msr_val); > > > > } > > > > > > No. Because I'm trying to contain is_td/is_td_vcpu in vmx specific and not use > > > in common x86 code. > > > > > > > I guess so. Centeralizing the initialization would be the nice and greatly > > improve the readablity of the code. Maybe adding a new callback in kvm x86_ops > > like .get_default_msr_val instead. > > Finally I can eliminate cpuid/APIC BASE MSR, and move it user space VMM, qemu. Great to hear.
© 2016 - 2025 Red Hat, Inc.