[PATCH] KVM: add support for AMD nested live migration

Paolo Bonzini posted 1 patch 3 years, 10 months ago
Test FreeBSD passed
Test docker-quick@centos7 passed
Test checkpatch passed
Test docker-mingw@fedora passed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20200624160608.383931-1-pbonzini@redhat.com
Maintainers: Paolo Bonzini <pbonzini@redhat.com>, Richard Henderson <rth@twiddle.net>, Marcelo Tosatti <mtosatti@redhat.com>, Eduardo Habkost <ehabkost@redhat.com>
target/i386/cpu.c     |  1 +
target/i386/cpu.h     |  5 +++++
target/i386/kvm.c     | 42 ++++++++++++++++++++++++++++++++++--------
target/i386/machine.c | 30 +++++++++++++++++++++++++++++-
4 files changed, 69 insertions(+), 9 deletions(-)
[PATCH] KVM: add support for AMD nested live migration
Posted by Paolo Bonzini 3 years, 10 months ago
Support for nested guest live migration is part of Linux 5.8, add the
corresponding code to QEMU.  The migration format consists of a few
flags, is an opaque 4k blob.

The blob is in VMCB format (the control area represents the L1 VMCB
control fields, the save area represents the pre-vmentry state; KVM does
not use the host save area since the AMD manual allows that) but QEMU
does not really care about that.  However, the flags need to be
copied to hflags/hflags2 and back.

In addition, support for retrieving and setting the AMD nested virtualization
states allows the L1 guest to be reset while running a nested guest, but
a small bug in CPU reset needs to be fixed for that to work.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 target/i386/cpu.c     |  1 +
 target/i386/cpu.h     |  5 +++++
 target/i386/kvm.c     | 42 ++++++++++++++++++++++++++++++++++--------
 target/i386/machine.c | 30 +++++++++++++++++++++++++++++-
 4 files changed, 69 insertions(+), 9 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 36cbd3d027..f1cbac2fb5 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -5987,6 +5987,7 @@ static void x86_cpu_reset(DeviceState *dev)
     /* init to reset state */
 
     env->hflags2 |= HF2_GIF_MASK;
+    env->hflags &= ~HF_GUEST_MASK;
 
     cpu_x86_update_cr0(env, 0x60000010);
     env->a20_mask = ~0x0;
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 7d77efd9e4..282f9ad35f 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2118,6 +2118,11 @@ static inline bool cpu_has_vmx(CPUX86State *env)
     return env->features[FEAT_1_ECX] & CPUID_EXT_VMX;
 }
 
+static inline bool cpu_has_svm(CPUX86State *env)
+{
+    return env->features[FEAT_8000_0001_ECX] & CPUID_EXT3_SVM;
+}
+
 /*
  * In order for a vCPU to enter VMX operation it must have CR4.VMXE set.
  * Since it was set, CR4.VMXE must remain set as long as vCPU is in
diff --git a/target/i386/kvm.c b/target/i386/kvm.c
index 6adbff3d74..2b6b7443d2 100644
--- a/target/i386/kvm.c
+++ b/target/i386/kvm.c
@@ -1840,16 +1840,18 @@ int kvm_arch_init_vcpu(CPUState *cs)
     if (max_nested_state_len > 0) {
         assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
 
-        if (cpu_has_vmx(env)) {
+        if (cpu_has_vmx(env) || cpu_has_svm(env)) {
             struct kvm_vmx_nested_state_hdr *vmx_hdr;
 
             env->nested_state = g_malloc0(max_nested_state_len);
             env->nested_state->size = max_nested_state_len;
             env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
 
-            vmx_hdr = &env->nested_state->hdr.vmx;
-            vmx_hdr->vmxon_pa = -1ull;
-            vmx_hdr->vmcs12_pa = -1ull;
+            if (cpu_has_vmx(env)) {
+                    vmx_hdr = &env->nested_state->hdr.vmx;
+                    vmx_hdr->vmxon_pa = -1ull;
+                    vmx_hdr->vmcs12_pa = -1ull;
+            }
         }
     }
 
@@ -3873,6 +3875,20 @@ static int kvm_put_nested_state(X86CPU *cpu)
         return 0;
     }
 
+    /*
+     * Copy flags that are affected by reset from env->hflags and env->hflags2.
+     */
+    if (env->hflags & HF_GUEST_MASK) {
+        env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
+    } else {
+        env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
+    }
+    if (env->hflags2 & HF2_GIF_MASK) {
+        env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
+    } else {
+        env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
+    }
+
     assert(env->nested_state->size <= max_nested_state_len);
     return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
 }
@@ -3901,11 +3917,19 @@ static int kvm_get_nested_state(X86CPU *cpu)
         return ret;
     }
 
+    /*
+     * Copy flags that are affected by reset to env->hflags and env->hflags2.
+     */
     if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
         env->hflags |= HF_GUEST_MASK;
     } else {
         env->hflags &= ~HF_GUEST_MASK;
     }
+    if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
+        env->hflags2 |= HF2_GIF_MASK;
+    } else {
+        env->hflags2 &= ~HF2_GIF_MASK;
+    }
 
     return ret;
 }
@@ -3917,6 +3941,12 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
 
     assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
 
+    /* must be before kvm_put_nested_state so that EFER.SVME is set */
+    ret = kvm_put_sregs(x86_cpu);
+    if (ret < 0) {
+        return ret;
+    }
+
     if (level >= KVM_PUT_RESET_STATE) {
         ret = kvm_put_nested_state(x86_cpu);
         if (ret < 0) {
@@ -3950,10 +3980,6 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
     if (ret < 0) {
         return ret;
     }
-    ret = kvm_put_sregs(x86_cpu);
-    if (ret < 0) {
-        return ret;
-    }
     /* must be before kvm_put_msrs */
     ret = kvm_inject_mce_oldstyle(x86_cpu);
     if (ret < 0) {
diff --git a/target/i386/machine.c b/target/i386/machine.c
index 0c96531a56..8684a247c1 100644
--- a/target/i386/machine.c
+++ b/target/i386/machine.c
@@ -1071,13 +1071,40 @@ static const VMStateDescription vmstate_vmx_nested_state = {
     }
 };
 
+static bool svm_nested_state_needed(void *opaque)
+{
+    struct kvm_nested_state *nested_state = opaque;
+
+    /*
+     * HF2_GIF_MASK is relevant for non-guest mode but it is already
+     * serialized via hflags2.
+     */
+    return (nested_state->format == KVM_STATE_NESTED_FORMAT_SVM &&
+            nested_state->size > offsetof(struct kvm_nested_state, data));
+}
+
+static const VMStateDescription vmstate_svm_nested_state = {
+    .name = "cpu/kvm_nested_state/svm",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = svm_nested_state_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_U64(hdr.svm.vmcb_pa, struct kvm_nested_state),
+        VMSTATE_UINT8_ARRAY(data.svm[0].vmcb12,
+                            struct kvm_nested_state,
+                            KVM_STATE_NESTED_SVM_VMCB_SIZE),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static bool nested_state_needed(void *opaque)
 {
     X86CPU *cpu = opaque;
     CPUX86State *env = &cpu->env;
 
     return (env->nested_state &&
-            vmx_nested_state_needed(env->nested_state));
+            (vmx_nested_state_needed(env->nested_state) ||
+             svm_nested_state_needed(env->nested_state)));
 }
 
 static int nested_state_post_load(void *opaque, int version_id)
@@ -1139,6 +1166,7 @@ static const VMStateDescription vmstate_kvm_nested_state = {
     },
     .subsections = (const VMStateDescription*[]) {
         &vmstate_vmx_nested_state,
+        &vmstate_svm_nested_state,
         NULL
     }
 };
-- 
2.26.2


Re: [PATCH] KVM: add support for AMD nested live migration
Posted by Dr. David Alan Gilbert 3 years, 9 months ago
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> Support for nested guest live migration is part of Linux 5.8, add the
> corresponding code to QEMU.  The migration format consists of a few
> flags, is an opaque 4k blob.
> 
> The blob is in VMCB format (the control area represents the L1 VMCB
> control fields, the save area represents the pre-vmentry state; KVM does
> not use the host save area since the AMD manual allows that) but QEMU
> does not really care about that.  However, the flags need to be
> copied to hflags/hflags2 and back.
> 
> In addition, support for retrieving and setting the AMD nested virtualization
> states allows the L1 guest to be reset while running a nested guest, but
> a small bug in CPU reset needs to be fixed for that to work.
> 
> Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>

OK, I wont claim to follow the flag wrangling, but from a migration
point of view this makes sense to me.

One question below...

> ---
>  target/i386/cpu.c     |  1 +
>  target/i386/cpu.h     |  5 +++++
>  target/i386/kvm.c     | 42 ++++++++++++++++++++++++++++++++++--------
>  target/i386/machine.c | 30 +++++++++++++++++++++++++++++-
>  4 files changed, 69 insertions(+), 9 deletions(-)
> 
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index 36cbd3d027..f1cbac2fb5 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -5987,6 +5987,7 @@ static void x86_cpu_reset(DeviceState *dev)
>      /* init to reset state */
>  
>      env->hflags2 |= HF2_GIF_MASK;
> +    env->hflags &= ~HF_GUEST_MASK;
>  
>      cpu_x86_update_cr0(env, 0x60000010);
>      env->a20_mask = ~0x0;
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 7d77efd9e4..282f9ad35f 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -2118,6 +2118,11 @@ static inline bool cpu_has_vmx(CPUX86State *env)
>      return env->features[FEAT_1_ECX] & CPUID_EXT_VMX;
>  }
>  
> +static inline bool cpu_has_svm(CPUX86State *env)
> +{
> +    return env->features[FEAT_8000_0001_ECX] & CPUID_EXT3_SVM;
> +}
> +
>  /*
>   * In order for a vCPU to enter VMX operation it must have CR4.VMXE set.
>   * Since it was set, CR4.VMXE must remain set as long as vCPU is in
> diff --git a/target/i386/kvm.c b/target/i386/kvm.c
> index 6adbff3d74..2b6b7443d2 100644
> --- a/target/i386/kvm.c
> +++ b/target/i386/kvm.c
> @@ -1840,16 +1840,18 @@ int kvm_arch_init_vcpu(CPUState *cs)
>      if (max_nested_state_len > 0) {
>          assert(max_nested_state_len >= offsetof(struct kvm_nested_state, data));
>  
> -        if (cpu_has_vmx(env)) {
> +        if (cpu_has_vmx(env) || cpu_has_svm(env)) {
>              struct kvm_vmx_nested_state_hdr *vmx_hdr;
>  
>              env->nested_state = g_malloc0(max_nested_state_len);
>              env->nested_state->size = max_nested_state_len;
>              env->nested_state->format = KVM_STATE_NESTED_FORMAT_VMX;
>  
> -            vmx_hdr = &env->nested_state->hdr.vmx;
> -            vmx_hdr->vmxon_pa = -1ull;
> -            vmx_hdr->vmcs12_pa = -1ull;
> +            if (cpu_has_vmx(env)) {
> +                    vmx_hdr = &env->nested_state->hdr.vmx;
> +                    vmx_hdr->vmxon_pa = -1ull;
> +                    vmx_hdr->vmcs12_pa = -1ull;
> +            }
>          }
>      }
>  
> @@ -3873,6 +3875,20 @@ static int kvm_put_nested_state(X86CPU *cpu)
>          return 0;
>      }
>  
> +    /*
> +     * Copy flags that are affected by reset from env->hflags and env->hflags2.
> +     */
> +    if (env->hflags & HF_GUEST_MASK) {
> +        env->nested_state->flags |= KVM_STATE_NESTED_GUEST_MODE;
> +    } else {
> +        env->nested_state->flags &= ~KVM_STATE_NESTED_GUEST_MODE;
> +    }
> +    if (env->hflags2 & HF2_GIF_MASK) {
> +        env->nested_state->flags |= KVM_STATE_NESTED_GIF_SET;
> +    } else {
> +        env->nested_state->flags &= ~KVM_STATE_NESTED_GIF_SET;
> +    }
> +
>      assert(env->nested_state->size <= max_nested_state_len);
>      return kvm_vcpu_ioctl(CPU(cpu), KVM_SET_NESTED_STATE, env->nested_state);
>  }
> @@ -3901,11 +3917,19 @@ static int kvm_get_nested_state(X86CPU *cpu)
>          return ret;
>      }
>  
> +    /*
> +     * Copy flags that are affected by reset to env->hflags and env->hflags2.
> +     */
>      if (env->nested_state->flags & KVM_STATE_NESTED_GUEST_MODE) {
>          env->hflags |= HF_GUEST_MASK;
>      } else {
>          env->hflags &= ~HF_GUEST_MASK;
>      }
> +    if (env->nested_state->flags & KVM_STATE_NESTED_GIF_SET) {
> +        env->hflags2 |= HF2_GIF_MASK;
> +    } else {
> +        env->hflags2 &= ~HF2_GIF_MASK;
> +    }
>  
>      return ret;
>  }
> @@ -3917,6 +3941,12 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
>  
>      assert(cpu_is_stopped(cpu) || qemu_cpu_is_self(cpu));
>  
> +    /* must be before kvm_put_nested_state so that EFER.SVME is set */
> +    ret = kvm_put_sregs(x86_cpu);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
>      if (level >= KVM_PUT_RESET_STATE) {
>          ret = kvm_put_nested_state(x86_cpu);
>          if (ret < 0) {
> @@ -3950,10 +3980,6 @@ int kvm_arch_put_registers(CPUState *cpu, int level)
>      if (ret < 0) {
>          return ret;
>      }
> -    ret = kvm_put_sregs(x86_cpu);
> -    if (ret < 0) {
> -        return ret;
> -    }
>      /* must be before kvm_put_msrs */
>      ret = kvm_inject_mce_oldstyle(x86_cpu);
>      if (ret < 0) {
> diff --git a/target/i386/machine.c b/target/i386/machine.c
> index 0c96531a56..8684a247c1 100644
> --- a/target/i386/machine.c
> +++ b/target/i386/machine.c
> @@ -1071,13 +1071,40 @@ static const VMStateDescription vmstate_vmx_nested_state = {
>      }
>  };
>  
> +static bool svm_nested_state_needed(void *opaque)
> +{
> +    struct kvm_nested_state *nested_state = opaque;
> +
> +    /*
> +     * HF2_GIF_MASK is relevant for non-guest mode but it is already
> +     * serialized via hflags2.
> +     */
> +    return (nested_state->format == KVM_STATE_NESTED_FORMAT_SVM &&
> +            nested_state->size > offsetof(struct kvm_nested_state, data));

How does this nested_state->size work? It looks like even if it's 1 byte
into 'data' we transmit a whole KVM_STATE_NESTED_SVM_VMCB_SIZE.

Dave

> +}
> +
> +static const VMStateDescription vmstate_svm_nested_state = {
> +    .name = "cpu/kvm_nested_state/svm",
> +    .version_id = 1,
> +    .minimum_version_id = 1,
> +    .needed = svm_nested_state_needed,
> +    .fields = (VMStateField[]) {
> +        VMSTATE_U64(hdr.svm.vmcb_pa, struct kvm_nested_state),
> +        VMSTATE_UINT8_ARRAY(data.svm[0].vmcb12,
> +                            struct kvm_nested_state,
> +                            KVM_STATE_NESTED_SVM_VMCB_SIZE),
> +        VMSTATE_END_OF_LIST()
> +    }
> +};
> +
>  static bool nested_state_needed(void *opaque)
>  {
>      X86CPU *cpu = opaque;
>      CPUX86State *env = &cpu->env;
>  
>      return (env->nested_state &&
> -            vmx_nested_state_needed(env->nested_state));
> +            (vmx_nested_state_needed(env->nested_state) ||
> +             svm_nested_state_needed(env->nested_state)));
>  }
>  
>  static int nested_state_post_load(void *opaque, int version_id)
> @@ -1139,6 +1166,7 @@ static const VMStateDescription vmstate_kvm_nested_state = {
>      },
>      .subsections = (const VMStateDescription*[]) {
>          &vmstate_vmx_nested_state,
> +        &vmstate_svm_nested_state,
>          NULL
>      }
>  };
> -- 
> 2.26.2
> 
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK


Re: [PATCH] KVM: add support for AMD nested live migration
Posted by Paolo Bonzini 3 years, 9 months ago
Il mer 1 lug 2020, 18:28 Dr. David Alan Gilbert <dgilbert@redhat.com> ha
scritto:

>
> > +    return (nested_state->format == KVM_STATE_NESTED_FORMAT_SVM &&
> > +            nested_state->size > offsetof(struct kvm_nested_state,
> data));
>
> How does this nested_state->size work? It looks like even if it's 1 byte
> into 'data' we transmit a whole KVM_STATE_NESTED_SVM_VMCB_SIZE.
>

The format is documented as having either 0 or KVM_STATE_NESTED_SVM_VMCB_SIZE
bytes (for vmx instead it can be 0, 4K or 8K).

Paolo


> Dave
>
> > +}
> > +
> > +static const VMStateDescription vmstate_svm_nested_state = {
> > +    .name = "cpu/kvm_nested_state/svm",
> > +    .version_id = 1,
> > +    .minimum_version_id = 1,
> > +    .needed = svm_nested_state_needed,
> > +    .fields = (VMStateField[]) {
> > +        VMSTATE_U64(hdr.svm.vmcb_pa, struct kvm_nested_state),
> > +        VMSTATE_UINT8_ARRAY(data.svm[0].vmcb12,
> > +                            struct kvm_nested_state,
> > +                            KVM_STATE_NESTED_SVM_VMCB_SIZE),
> > +        VMSTATE_END_OF_LIST()
> > +    }
> > +};
> > +
> >  static bool nested_state_needed(void *opaque)
> >  {
> >      X86CPU *cpu = opaque;
> >      CPUX86State *env = &cpu->env;
> >
> >      return (env->nested_state &&
> > -            vmx_nested_state_needed(env->nested_state));
> > +            (vmx_nested_state_needed(env->nested_state) ||
> > +             svm_nested_state_needed(env->nested_state)));
> >  }
> >
> >  static int nested_state_post_load(void *opaque, int version_id)
> > @@ -1139,6 +1166,7 @@ static const VMStateDescription
> vmstate_kvm_nested_state = {
> >      },
> >      .subsections = (const VMStateDescription*[]) {
> >          &vmstate_vmx_nested_state,
> > +        &vmstate_svm_nested_state,
> >          NULL
> >      }
> >  };
> > --
> > 2.26.2
> >
> --
> Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
>
>
Re: [PATCH] KVM: add support for AMD nested live migration
Posted by Dr. David Alan Gilbert 3 years, 9 months ago
* Paolo Bonzini (pbonzini@redhat.com) wrote:
> Il mer 1 lug 2020, 18:28 Dr. David Alan Gilbert <dgilbert@redhat.com> ha
> scritto:
> 
> >
> > > +    return (nested_state->format == KVM_STATE_NESTED_FORMAT_SVM &&
> > > +            nested_state->size > offsetof(struct kvm_nested_state,
> > data));
> >
> > How does this nested_state->size work? It looks like even if it's 1 byte
> > into 'data' we transmit a whole KVM_STATE_NESTED_SVM_VMCB_SIZE.
> >
> 
> The format is documented as having either 0 or KVM_STATE_NESTED_SVM_VMCB_SIZE
> bytes (for vmx instead it can be 0, 4K or 8K).

OK.

Dave

> Paolo
> 
> 
> > Dave
> >
> > > +}
> > > +
> > > +static const VMStateDescription vmstate_svm_nested_state = {
> > > +    .name = "cpu/kvm_nested_state/svm",
> > > +    .version_id = 1,
> > > +    .minimum_version_id = 1,
> > > +    .needed = svm_nested_state_needed,
> > > +    .fields = (VMStateField[]) {
> > > +        VMSTATE_U64(hdr.svm.vmcb_pa, struct kvm_nested_state),
> > > +        VMSTATE_UINT8_ARRAY(data.svm[0].vmcb12,
> > > +                            struct kvm_nested_state,
> > > +                            KVM_STATE_NESTED_SVM_VMCB_SIZE),
> > > +        VMSTATE_END_OF_LIST()
> > > +    }
> > > +};
> > > +
> > >  static bool nested_state_needed(void *opaque)
> > >  {
> > >      X86CPU *cpu = opaque;
> > >      CPUX86State *env = &cpu->env;
> > >
> > >      return (env->nested_state &&
> > > -            vmx_nested_state_needed(env->nested_state));
> > > +            (vmx_nested_state_needed(env->nested_state) ||
> > > +             svm_nested_state_needed(env->nested_state)));
> > >  }
> > >
> > >  static int nested_state_post_load(void *opaque, int version_id)
> > > @@ -1139,6 +1166,7 @@ static const VMStateDescription
> > vmstate_kvm_nested_state = {
> > >      },
> > >      .subsections = (const VMStateDescription*[]) {
> > >          &vmstate_vmx_nested_state,
> > > +        &vmstate_svm_nested_state,
> > >          NULL
> > >      }
> > >  };
> > > --
> > > 2.26.2
> > >
> > --
> > Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK
> >
> >
--
Dr. David Alan Gilbert / dgilbert@redhat.com / Manchester, UK