arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 21 ++++++++++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-)
From: Peng Hao <flyingpeng@tencent.com>
A vcpu_load()/vcpu_put() pair takes about 1-2us. Each call to
kvm_arch_vcpu_create() invokes vcpu_load()/vcpu_put() in order to
initialize some VMCS fields. That initialization can instead be
deferred until the first vcpu ioctl, at which point the vCPU is
already loaded, thereby eliminating one vcpu_load()/vcpu_put()
pair per vCPU.
Signed-off-by: Peng Hao <flyingpeng@tencent.com>
---
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/kvm/x86.c | 21 ++++++++++++++++-----
2 files changed, 17 insertions(+), 5 deletions(-)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9320019708f9..2f2dcd283788 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -984,6 +984,7 @@ struct kvm_vcpu_arch {
/* Flush the L1 Data cache for L1TF mitigation on VMENTER */
bool l1tf_flush_l1d;
+ bool initialized;
/* Host CPU on which VM-entry was most recently attempted */
int last_vmentry_cpu;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4fd08a5e0e98..a3671a54e850 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -317,7 +317,20 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
u64 __read_mostly host_xcr0;
static struct kmem_cache *x86_emulator_cache;
+static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
+static inline bool kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.initialized;
+}
+
+static void kvm_vcpu_initial_reset(struct kvm_vcpu *vcpu)
+{
+ kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
+ kvm_vcpu_reset(vcpu, false);
+ kvm_init_mmu(vcpu);
+ vcpu->arch.initialized = true;
+}
/*
* When called, it means the previous get/set msr reached an invalid msr.
* Return true if we want to ignore/silent this failed msr access.
@@ -5647,6 +5660,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
vcpu_load(vcpu);
+ if (!kvm_vcpu_initialized(vcpu))
+ kvm_vcpu_initial_reset(vcpu);
+
u.buffer = NULL;
switch (ioctl) {
case KVM_GET_LAPIC: {
@@ -11930,11 +11946,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_xen_init_vcpu(vcpu);
kvm_vcpu_mtrr_init(vcpu);
- vcpu_load(vcpu);
- kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
- kvm_vcpu_reset(vcpu, false);
- kvm_init_mmu(vcpu);
- vcpu_put(vcpu);
return 0;
free_guest_fpu:
--
2.31.1
On 9/6/2023 2:24 PM, Hao Peng wrote:
> From: Peng Hao <flyingpeng@tencent.com>
>
> The call of vcpu_load/put takes about 1-2us. Each
> kvm_arch_vcpu_create will call vcpu_load/put
> to initialize some fields of vmcs, which can be
> delayed until the call of vcpu_ioctl to process
> this part of the vmcs field, which can reduce calls
> to vcpu_load.
What if no vcpu ioctl is called after vcpu creation?
And will the first vcpu_load() (it was the second before this patch) become
longer? Have you measured it?
I don't think the optimization is worthwhile without a strong reason.
> Signed-off-by: Peng Hao <flyingpeng@tencent.com>
> ---
> arch/x86/include/asm/kvm_host.h | 1 +
> arch/x86/kvm/x86.c | 21 ++++++++++++++++-----
> 2 files changed, 17 insertions(+), 5 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 9320019708f9..2f2dcd283788 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -984,6 +984,7 @@ struct kvm_vcpu_arch {
> /* Flush the L1 Data cache for L1TF mitigation on VMENTER */
> bool l1tf_flush_l1d;
>
> + bool initialized;
> /* Host CPU on which VM-entry was most recently attempted */
> int last_vmentry_cpu;
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 4fd08a5e0e98..a3671a54e850 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -317,7 +317,20 @@ const struct kvm_stats_header kvm_vcpu_stats_header = {
> u64 __read_mostly host_xcr0;
>
> static struct kmem_cache *x86_emulator_cache;
> +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
>
> +static inline bool kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
> +{
> + return vcpu->arch.initialized;
> +}
> +
> +static void kvm_vcpu_initial_reset(struct kvm_vcpu *vcpu)
> +{
> + kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
> + kvm_vcpu_reset(vcpu, false);
> + kvm_init_mmu(vcpu);
> + vcpu->arch.initialized = true;
> +}
> /*
> * When called, it means the previous get/set msr reached an invalid msr.
> * Return true if we want to ignore/silent this failed msr access.
> @@ -5647,6 +5660,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>
> vcpu_load(vcpu);
>
> + if (!kvm_vcpu_initialized(vcpu))
> + kvm_vcpu_initial_reset(vcpu);
> +
> u.buffer = NULL;
> switch (ioctl) {
> case KVM_GET_LAPIC: {
> @@ -11930,11 +11946,6 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
> kvm_xen_init_vcpu(vcpu);
> kvm_vcpu_mtrr_init(vcpu);
> - vcpu_load(vcpu);
> - kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
> - kvm_vcpu_reset(vcpu, false);
> - kvm_init_mmu(vcpu);
> - vcpu_put(vcpu);
> return 0;
>
> free_guest_fpu:
> --
> 2.31.1
On Wed, Sep 06, 2023, Xiaoyao Li wrote: > On 9/6/2023 2:24 PM, Hao Peng wrote: > > From: Peng Hao <flyingpeng@tencent.com> > > > > The call of vcpu_load/put takes about 1-2us. Each > > kvm_arch_vcpu_create will call vcpu_load/put > > to initialize some fields of vmcs, which can be > > delayed until the call of vcpu_ioctl to process > > this part of the vmcs field, which can reduce calls > > to vcpu_load. > > what if no vcpu ioctl is called after vcpu creation? > > And will the first (it was second before this patch) vcpu_load() becomes > longer? have you measured it? I don't think the first vcpu_load() becomes longer, this avoids an entire load()+put() pair by doing the initialization in the first ioctl(). That said, the patch is obviously buggy, it hooks kvm_arch_vcpu_ioctl() instead of kvm_vcpu_ioctl(), e.g. doing KVM_RUN, KVM_SET_SREGS, etc. will cause explosions. It will also break the TSC synchronization logic in kvm_arch_vcpu_postcreate(), which can "race" with ioctls() as the vCPU file descriptor is accessible by userspace the instant it's installed into the fd tables, i.e. userspace doesn't have to wait for KVM_CREATE_VCPU to complete. And I gotta imagine there are other interactions I haven't thought of off the top of my head, e.g. the vCPU is also reachable via kvm_for_each_vcpu(). All it takes is one path that touches a lazily initialized field for this to fall apart. > I don't think it worth the optimization unless a strong reason. Yeah, this is a lot of subtle complexity to shave 1-2us.
On Thu, Sep 7, 2023 at 4:08 AM Sean Christopherson <seanjc@google.com> wrote: > > On Wed, Sep 06, 2023, Xiaoyao Li wrote: > > On 9/6/2023 2:24 PM, Hao Peng wrote: > > > From: Peng Hao <flyingpeng@tencent.com> > > > > > > The call of vcpu_load/put takes about 1-2us. Each > > > kvm_arch_vcpu_create will call vcpu_load/put > > > to initialize some fields of vmcs, which can be > > > delayed until the call of vcpu_ioctl to process > > > this part of the vmcs field, which can reduce calls > > > to vcpu_load. > > > > what if no vcpu ioctl is called after vcpu creation? > > > > And will the first (it was second before this patch) vcpu_load() becomes > > longer? have you measured it? > > I don't think the first vcpu_load() becomes longer, this avoids an entire > load()+put() pair by doing the initialization in the first ioctl(). > > That said, the patch is obviously buggy, it hooks kvm_arch_vcpu_ioctl() instead > of kvm_vcpu_ioctl(), e.g. doing KVM_RUN, KVM_SET_SREGS, etc. will cause explosions. > > It will also break the TSC synchronization logic in kvm_arch_vcpu_postcreate(), > which can "race" with ioctls() as the vCPU file descriptor is accessible by > userspace the instant it's installed into the fd tables, i.e. userspace doesn't > have to wait for KVM_CREATE_VCPU to complete. > It works when there are many cores. The hook point problem mentioned above can still be adjusted, but the tsc synchronization problem is difficult to deal with. thanks. > And I gotta imagine there are other interactions I haven't thought of off the > top of my head, e.g. the vCPU is also reachable via kvm_for_each_vcpu(). All it > takes is one path that touches a lazily initialized field for this to fall apart. > > > I don't think it worth the optimization unless a strong reason. > > Yeah, this is a lot of subtle complexity to shave 1-2us.
© 2016 - 2026 Red Hat, Inc.