arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-)
From: Peng Hao <flyingpeng@tencent.com>
A vcpu_load()/vcpu_put() pair takes about 1-2us. Each call to
kvm_arch_vcpu_create() invokes vcpu_load()/vcpu_put() in order to
initialize some VMCS fields. That initialization can instead be
deferred until the first vcpu ioctl, at which point the vCPU is
already loaded, thereby eliminating one vcpu_load()/vcpu_put()
pair per vCPU.
Signed-off-by: Peng Hao <flyingpeng@tencent.com>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/x86.c | 21 ++++++++++++++++-----
2 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9320019708f9..2f2dcd283788 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -984,6 +984,7 @@ struct kvm_vcpu_arch {
/* Flush the L1 Data cache for L1TF mitigation on VMENTER */
bool l1tf_flush_l1d;
+ bool initialized;
/* Host CPU on which VM-entry was most recently attempted */
int last_vmentry_cpu;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4fd08a5e0e98..a3671a54e850 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -317,7 +317,20 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
u64 __read_mostly host_xcr0;
static struct kmem_cache *x86_emulator_cache;
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
+static inline bool kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.initialized;
+}
+
+static void kvm_vcpu_initial_reset(struct kvm_vcpu *vcpu)
+{
+ kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
+ kvm_vcpu_reset(vcpu, false);
+ kvm_init_mmu(vcpu);
+ vcpu->arch.initialized = true;
+}
/*
* When called, it means the previous get/set msr reached an invalid msr.
* Return true if we want to ignore/silent this failed msr access.
@@ -5647,6 +5660,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
vcpu_load(vcpu);
+ if (!kvm_vcpu_initialized(vcpu))
+ kvm_vcpu_initial_reset(vcpu);
+
u.buffer = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
@@ -11930,11 +11946,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_xen_init_vcpu(vcpu);
kvm_vcpu_mtrr_init(vcpu);
- vcpu_load(vcpu);
- kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
- kvm_vcpu_reset(vcpu, false);
- kvm_init_mmu(vcpu);
- vcpu_put(vcpu);
return 0;
free_guest_fpu:
--
2.31.1
On 9/6/2023 2:24 PM, Hao Peng wrote:
> From: Peng Hao <flyingpeng@tencent.com>
>
> The call of vcpu_load/put takes about 1-2us. Each
> kvm_arch_vcpu_create will call vcpu_load/put
> to initialize some fields of vmcs, which can be
> delayed until the call of vcpu_ioctl to process
> this part of the vmcs field, which can reduce calls
> to vcpu_load.
What if no vcpu ioctl is called after vcpu creation?
And will the first vcpu_load() (it was the second before this patch) become
longer? Have you measured it?
I don't think the optimization is worthwhile without a strong reason.
> Signed-off-by: Peng Hao <flyingpeng@tencent.com>
> ---
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/kvm/x86.c | 21 ++++++++++++++++-----
> 2 files changed, 17 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 9320019708f9..2f2dcd283788 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -984,6 +984,7 @@ struct kvm_vcpu_arch {
> /* Flush the L1 Data cache for L1TF mitigation on VMENTER */
> bool l1tf_flush_l1d;
>
> + bool initialized;
> /* Host CPU on which VM-entry was most recently attempted */
> int last_vmentry_cpu;
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 4fd08a5e0e98..a3671a54e850 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -317,7 +317,20 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
> u64 __read_mostly host_xcr0;
>
> static struct kmem_cache *x86_emulator_cache;
> +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
>
> +static inline bool kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
> +{
> + return vcpu->arch.initialized;
> +}
> +
> +static void kvm_vcpu_initial_reset(struct kvm_vcpu *vcpu)
> +{
> + kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
> + kvm_vcpu_reset(vcpu, false);
> + kvm_init_mmu(vcpu);
> + vcpu->arch.initialized = true;
> +}
> /*
> * When called, it means the previous get/set msr reached an invalid msr.
> * Return true if we want to ignore/silent this failed msr access.
> @@ -5647,6 +5660,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>
> vcpu_load(vcpu);
>
> + if (!kvm_vcpu_initialized(vcpu))
> + kvm_vcpu_initial_reset(vcpu);
> +
> u.buffer = NULL;
> switch (ioctl) {
> case KVM_GET_LAPIC: {
> @@ -11930,11 +11946,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
> kvm_xen_init_vcpu(vcpu);
> kvm_vcpu_mtrr_init(vcpu);
> - vcpu_load(vcpu);
> - kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
> - kvm_vcpu_reset(vcpu, false);
> - kvm_init_mmu(vcpu);
> - vcpu_put(vcpu);
> return 0;
>
> free_guest_fpu:
> --
> 2.31.1
On Wed, Sep 06, 2023, Xiaoyao Li wrote: > On 9/6/2023 2:24 PM, Hao Peng wrote: > > From: Peng Hao <flyingpeng@tencent.com> > > > > The call of vcpu_load/put takes about 1-2us. Each > > kvm_arch_vcpu_create will call vcpu_load/put > > to initialize some fields of vmcs, which can be > > delayed until the call of vcpu_ioctl to process > > this part of the vmcs field, which can reduce calls > > to vcpu_load. > > what if no vcpu ioctl is called after vcpu creation? > > And will the first (it was second before this patch) vcpu_load() becomes > longer? have you measured it? I don't think the first vcpu_load() becomes longer, this avoids an entire load()+put() pair by doing the initialization in the first ioctl(). That said, the patch is obviously buggy, it hooks kvm_arch_vcpu_ioctl() instead of kvm_vcpu_ioctl(), e.g. doing KVM_RUN, KVM_SET_SREGS, etc. will cause explosions. It will also break the TSC synchronization logic in kvm_arch_vcpu_postcreate(), which can "race" with ioctls() as the vCPU file descriptor is accessible by userspace the instant it's installed into the fd tables, i.e. userspace doesn't have to wait for KVM_CREATE_VCPU to complete. And I gotta imagine there are other interactions I haven't thought of off the top of my head, e.g. the vCPU is also reachable via kvm_for_each_vcpu(). All it takes is one path that touches a lazily initialized field for this to fall apart. > I don't think it worth the optimization unless a strong reason. Yeah, this is a lot of subtle complexity to shave 1-2us.
On Thu, Sep 7, 2023 at 4:08 AM Sean Christopherson <seanjc@google.com> wrote: > > On Wed, Sep 06, 2023, Xiaoyao Li wrote: > > On 9/6/2023 2:24 PM, Hao Peng wrote: > > > From: Peng Hao <flyingpeng@tencent.com> > > > > > > The call of vcpu_load/put takes about 1-2us. Each > > > kvm_arch_vcpu_create will call vcpu_load/put > > > to initialize some fields of vmcs, which can be > > > delayed until the call of vcpu_ioctl to process > > > this part of the vmcs field, which can reduce calls > > > to vcpu_load. > > > > what if no vcpu ioctl is called after vcpu creation? > > > > And will the first (it was second before this patch) vcpu_load() becomes > > longer? have you measured it? > > I don't think the first vcpu_load() becomes longer, this avoids an entire > load()+put() pair by doing the initialization in the first ioctl(). > > That said, the patch is obviously buggy, it hooks kvm_arch_vcpu_ioctl() instead > of kvm_vcpu_ioctl(), e.g. doing KVM_RUN, KVM_SET_SREGS, etc. will cause explosions. > > It will also break the TSC synchronization logic in kvm_arch_vcpu_postcreate(), > which can "race" with ioctls() as the vCPU file descriptor is accessible by > userspace the instant it's installed into the fd tables, i.e. userspace doesn't > have to wait for KVM_CREATE_VCPU to complete. > It works when there are many cores. The hook point problem mentioned above can still be adjusted, but the tsc synchronization problem is difficult to deal with. thanks. > And I gotta imagine there are other interactions I haven't thought of off the > top of my head, e.g. the vCPU is also reachable via kvm_for_each_vcpu(). All it > takes is one path that touches a lazily initialized field for this to fall apart. > > > I don't think it worth the optimization unless a strong reason. > > Yeah, this is a lot of subtle complexity to shave 1-2us.
© 2016 - 2026 Red Hat, Inc.